michael@0: /*
michael@0: *******************************************************************************
michael@0: * Copyright (C) 2013, International Business Machines
michael@0: * Corporation and others.  All Rights Reserved.
michael@0: *******************************************************************************
michael@0: * dictionarydata.h
michael@0: *
michael@0: * created on: 2012may31
michael@0: * created by: Markus W. Scherer & Maxime Serrano
michael@0: */
michael@0: 
michael@0: #include "dictionarydata.h"
michael@0: #include "unicode/ucharstrie.h"
michael@0: #include "unicode/bytestrie.h"
michael@0: #include "unicode/udata.h"
michael@0: #include "cmemory.h"
michael@0: 
michael@0: #if !UCONFIG_NO_BREAK_ITERATION
michael@0: 
michael@0: U_NAMESPACE_BEGIN
michael@0: 
michael@0: const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
michael@0: const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
michael@0: const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
michael@0: const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
michael@0: 
michael@0: const int32_t  DictionaryData::TRANSFORM_NONE = 0;
michael@0: const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
michael@0: const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
michael@0: const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
michael@0:     
michael@0: DictionaryMatcher::~DictionaryMatcher() {
michael@0: }
michael@0: 
michael@0: UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
michael@0:     udata_close(file);
michael@0: }
michael@0: 
michael@0: int32_t UCharsDictionaryMatcher::getType() const {
michael@0:     return DictionaryData::TRIE_TYPE_UCHARS;
michael@0: }
michael@0: 
michael@0: int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
michael@0:     UCharsTrie uct(characters);
michael@0:     UChar32 c = utext_next32(text);
michael@0:     if (c < 0) {
michael@0:         return 0;
michael@0:     }
michael@0:     UStringTrieResult result = uct.first(c);
michael@0:     int32_t numChars = 1;
michael@0:     count = 0;
michael@0:     for (;;) {
michael@0:         if (USTRINGTRIE_HAS_VALUE(result)) {
michael@0:             if (count < limit) {
michael@0:                 if (values != NULL) {
michael@0:                     values[count] = uct.getValue();
michael@0:                 }
michael@0:                 lengths[count++] = numChars;
michael@0:             }
michael@0:             if (result == USTRINGTRIE_FINAL_VALUE) {
michael@0:                 break;
michael@0:             }
michael@0:         }
michael@0:         else if (result == USTRINGTRIE_NO_MATCH) {
michael@0:             break;
michael@0:         }
michael@0: 
michael@0:         // TODO: why do we have a text limit if the UText knows its length?
michael@0:         if (numChars >= maxLength) {
michael@0:             break;
michael@0:         }
michael@0: 
michael@0:         c = utext_next32(text);
michael@0:         if (c < 0) {
michael@0:             break;
michael@0:         }
michael@0:         ++numChars;
michael@0:         result = uct.next(c);
michael@0:     }
michael@0:     return numChars;
michael@0: }
michael@0: 
michael@0: BytesDictionaryMatcher::~BytesDictionaryMatcher() {
michael@0:     udata_close(file);
michael@0: }
michael@0: 
michael@0: UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
michael@0:     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
michael@0:         if (c == 0x200D) {
michael@0:             return 0xFF;
michael@0:         } else if (c == 0x200C) {
michael@0:             return 0xFE;
michael@0:         }
michael@0:         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
michael@0:         if (delta < 0 || 0xFD < delta) {
michael@0:             return U_SENTINEL;
michael@0:         }
michael@0:         return (UChar32)delta;
michael@0:     }
michael@0:     return c;
michael@0: }
michael@0: 
michael@0: int32_t BytesDictionaryMatcher::getType() const {
michael@0:     return DictionaryData::TRIE_TYPE_BYTES;
michael@0: }
michael@0: 
michael@0: int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
michael@0:     BytesTrie bt(characters);
michael@0:     UChar32 c = utext_next32(text);
michael@0:     if (c < 0) {
michael@0:         return 0;
michael@0:     }
michael@0:     UStringTrieResult result = bt.first(transform(c));
michael@0:     int32_t numChars = 1;
michael@0:     count = 0;
michael@0:     for (;;) {
michael@0:         if (USTRINGTRIE_HAS_VALUE(result)) {
michael@0:             if (count < limit) {
michael@0:                 if (values != NULL) {
michael@0:                     values[count] = bt.getValue();
michael@0:             }
michael@0:                 lengths[count++] = numChars;
michael@0:             }
michael@0:             if (result == USTRINGTRIE_FINAL_VALUE) {
michael@0:                 break;
michael@0:             }
michael@0:         }
michael@0:         else if (result == USTRINGTRIE_NO_MATCH) {
michael@0:             break;
michael@0:         }
michael@0: 
michael@0:         // TODO: why do we have a text limit if the UText knows its length?
michael@0:         if (numChars >= maxLength) {
michael@0:             break;
michael@0:         }
michael@0: 
michael@0:         c = utext_next32(text);
michael@0:         if (c < 0) {
michael@0:             break;
michael@0:         }
michael@0:         ++numChars;
michael@0:         result = bt.next(transform(c));
michael@0:     }
michael@0:     return numChars;
michael@0: }
michael@0: 
michael@0: 
michael@0: U_NAMESPACE_END
michael@0: 
michael@0: U_NAMESPACE_USE
michael@0: 
michael@0: U_CAPI int32_t U_EXPORT2
michael@0: udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
michael@0:            void *outData, UErrorCode *pErrorCode) {
michael@0:     const UDataInfo *pInfo;
michael@0:     int32_t headerSize;
michael@0:     const uint8_t *inBytes;
michael@0:     uint8_t *outBytes;
michael@0:     const int32_t *inIndexes;
michael@0:     int32_t indexes[DictionaryData::IX_COUNT];
michael@0:     int32_t i, offset, size;
michael@0: 
michael@0:     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
michael@0:     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
michael@0:     pInfo = (const UDataInfo *)((const char *)inData + 4);
michael@0:     if (!(pInfo->dataFormat[0] == 0x44 && 
michael@0:           pInfo->dataFormat[1] == 0x69 && 
michael@0:           pInfo->dataFormat[2] == 0x63 && 
michael@0:           pInfo->dataFormat[3] == 0x74 && 
michael@0:           pInfo->formatVersion[0] == 1)) {
michael@0:         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
michael@0:                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
michael@0:         *pErrorCode = U_UNSUPPORTED_ERROR;
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     inBytes = (const uint8_t *)inData + headerSize;
michael@0:     outBytes = (uint8_t *)outData + headerSize;
michael@0: 
michael@0:     inIndexes = (const int32_t *)inBytes;
michael@0:     if (length >= 0) {
michael@0:         length -= headerSize;
michael@0:         if (length < (int32_t)(sizeof(indexes))) {
michael@0:             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
michael@0:             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0:             return 0;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
michael@0:         indexes[i] = udata_readInt32(ds, inIndexes[i]);
michael@0:     }
michael@0: 
michael@0:     size = indexes[DictionaryData::IX_TOTAL_SIZE];
michael@0: 
michael@0:     if (length >= 0) {
michael@0:         if (length < size) {
michael@0:             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
michael@0:             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0:             return 0;
michael@0:         }
michael@0: 
michael@0:         if (inBytes != outBytes) {
michael@0:             uprv_memcpy(outBytes, inBytes, size);
michael@0:         }
michael@0: 
michael@0:         offset = 0;
michael@0:         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
michael@0:         offset = (int32_t)sizeof(indexes);
michael@0:         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
michael@0:         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
michael@0: 
michael@0:         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
michael@0:             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
michael@0:         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
michael@0:             // nothing to do
michael@0:         } else {
michael@0:             udata_printError(ds, "udict_swap(): unknown trie type!\n");
michael@0:             *pErrorCode = U_UNSUPPORTED_ERROR;
michael@0:             return 0;
michael@0:         }
michael@0: 
michael@0:         // these next two sections are empty in the current format,
michael@0:         // but may be used later.
michael@0:         offset = nextOffset;
michael@0:         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
michael@0:         offset = nextOffset;
michael@0:         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
michael@0:         offset = nextOffset;
michael@0:     }
michael@0:     return headerSize + size;
michael@0: }
michael@0: #endif