michael@0: // Copyright 2014 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: #ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ michael@0: #define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ michael@0: michael@0: #include "integral_types.h" michael@0: #include "cld2tablesummary.h" michael@0: #include "utf8statetable.h" michael@0: #include "scoreonescriptspan.h" michael@0: michael@0: /* michael@0: There are two primary parts to a CLD2 dynamic data file: michael@0: 1. A header, wherein trivial data, block lengths and block offsets are kept michael@0: 2. A data block, wherein the large binary blocks are kept michael@0: michael@0: By reading the header, an application can determine the offsets and lengths of michael@0: all the data blocks for all tables. Offsets in the header are expressed michael@0: relative to the first byte of the file, inclusive of the header itself; thus, michael@0: any offset whose value is less than the length of the header is invalid. michael@0: michael@0: Any offset whose value is zero indicates a field that is null in the michael@0: underlying CLD2 data; a real example of this is the fast_state field of the michael@0: UTF8PropObj, which may be null. michael@0: michael@0: The size of the header can be precalculated by calling calculateHeaderSize(), michael@0: which will indicate the exact size of the header for a data file that contains michael@0: a given number of CLD2TableSummary objects. michael@0: michael@0: Notes on endianness: michael@0: The data format is only suitable for little-endian machines. For big-endian michael@0: systems, a tedious transformation would need to be made first to reverse the michael@0: byte order of significant portions of the binary - not just the lengths, but michael@0: also some of the underlying table data. michael@0: michael@0: Note on 32/64 bit: michael@0: The data format is agnostic to 32/64 bit pointers. All the offsets within the michael@0: data blob itself are 32-bit values relative to the start of the file, and the michael@0: file should certainly never be gigabytes in size! michael@0: When the file is ultimately read by the loading code and mmap()'d, new michael@0: pointers are generated at whatever size the system uses, initialized to the michael@0: start of the mmap, and incremented by the 32-bit offset. This should be safe michael@0: regardless of 32- or 64-bit architectures. michael@0: michael@0: -------------------------------------------------------------------- michael@0: FIELD michael@0: -------------------------------------------------------------------- michael@0: DATA_FILE_MARKER (no null terminator) michael@0: total file size (sanity check, uint32) michael@0: -------------------------------------------------------------------- michael@0: UTF8PropObj: const uint32 state0 michael@0: UTF8PropObj: const uint32 state0_size michael@0: UTF8PropObj: const uint32 total_size michael@0: UTF8PropObj: const int max_expand michael@0: UTF8PropObj: const int entry_shift (coerced to 32 bits) michael@0: UTF8PropObj: const int bytes_per_entry (coerced to 32 bits) michael@0: UTF8PropObj: const uint32 losub michael@0: UTF8PropObj: const uint32 hiadd michael@0: offset of UTF8PropObj: const uint8* state_table michael@0: length of UTF8PropObj: const uint8* state_table michael@0: offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) michael@0: length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) michael@0: offset of UTF8PropObj: const uint8* remap_string michael@0: length of UTF8PropObj: const uint8* remap_string michael@0: offset of UTF8PropObj: const uint8* fast_state michael@0: length of UTF8PropObj: const uint8* fast_state michael@0: -------------------------------------------------------------------- michael@0: start of const short kAvgDeltaOctaScore[] michael@0: length of const short kAvgDeltaOctaScore[] michael@0: -------------------------------------------------------------------- michael@0: number of CLD2TableSummary objects encoded (n) michael@0: [Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne michael@0: [Table 1]: CLD2TableSummary: uint32 kCLDTableSize michael@0: [Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask michael@0: [Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate michael@0: [Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable michael@0: [Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable michael@0: [Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd michael@0: [Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd michael@0: [Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts michael@0: [Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 michael@0: . michael@0: . michael@0: . michael@0: [Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne michael@0: [Table n]: CLD2TableSummary: uint32 kCLDTableSize michael@0: [Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask michael@0: [Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate michael@0: [Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable michael@0: [Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable michael@0: [Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd michael@0: [Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd michael@0: [Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts michael@0: [Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 michael@0: -------------------------------------------------------------------- michael@0: michael@0: michael@0: Immediately after the header fields comes the data block. The data block has michael@0: the following content, in this order (note that padding is applied in order to michael@0: keep lookups word-aligned): michael@0: michael@0: UTF8PropObj: const uint8* state_table michael@0: UTF8PropObj: const RemapEntry* remap_base (4-byte struct) michael@0: UTF8PropObj: const uint8* remap_string michael@0: UTF8PropObj: const uint8* fast_state michael@0: const short kAvgDeltaOctaScore[] michael@0: [Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable michael@0: [Table 1]: CLD2TableSummary: const uint32* kCLDTableInd michael@0: [Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) michael@0: . michael@0: . michael@0: . michael@0: [Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable michael@0: [Table n]: CLD2TableSummary: const uint32* kCLDTableInd michael@0: [Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) michael@0: michael@0: michael@0: It is STRONGLY recommended that the chunks within the data block be kept michael@0: 128-bit aligned for efficiency reasons, although the code will work without michael@0: such alignment: the main lookup tables have randomly-accessed groups of four michael@0: 4-byte entries, and these must be 16-byte aligned to avoid the performance michael@0: cost of multiple cache misses per group. michael@0: */ michael@0: namespace CLD2DynamicData { michael@0: michael@0: static const char* DATA_FILE_MARKER = "cld2_data_file00"; michael@0: static const int DATA_FILE_MARKER_LENGTH = 16; // Keep aligned to 128 bits michael@0: michael@0: // Nicer version of memcmp that shows the offset at which bytes differ michael@0: bool mem_compare(const void* data1, const void* data2, const int length); michael@0: michael@0: // Enable or disable debugging; 0 to disable, 1 to enable michael@0: void setDebug(int debug); michael@0: michael@0: // Lower-level structure for individual tables. There are n table headers in michael@0: // a given file header. michael@0: typedef struct { michael@0: CLD2::uint32 kCLDTableSizeOne; michael@0: CLD2::uint32 kCLDTableSize; michael@0: CLD2::uint32 kCLDTableKeyMask; michael@0: CLD2::uint32 kCLDTableBuildDate; michael@0: CLD2::uint32 startOf_kCLDTable; michael@0: CLD2::uint32 lengthOf_kCLDTable; michael@0: CLD2::uint32 startOf_kCLDTableInd; michael@0: CLD2::uint32 lengthOf_kCLDTableInd; michael@0: CLD2::uint32 startOf_kRecognizedLangScripts; michael@0: CLD2::uint32 lengthOf_kRecognizedLangScripts; michael@0: } TableHeader; michael@0: michael@0: michael@0: // Top-level structure for a CLD2 Data File Header. michael@0: // Contains all the primitive fields for the header as well as an array of michael@0: // headers for the individual tables. michael@0: typedef struct { michael@0: // Marker fields help recognize and verify the data file michael@0: char sanityString[DATA_FILE_MARKER_LENGTH]; michael@0: CLD2::uint32 totalFileSizeBytes; michael@0: michael@0: // UTF8 primitives michael@0: CLD2::uint32 utf8PropObj_state0; michael@0: CLD2::uint32 utf8PropObj_state0_size; michael@0: CLD2::uint32 utf8PropObj_total_size; michael@0: CLD2::uint32 utf8PropObj_max_expand; michael@0: CLD2::uint32 utf8PropObj_entry_shift; michael@0: CLD2::uint32 utf8PropObj_bytes_per_entry; michael@0: CLD2::uint32 utf8PropObj_losub; michael@0: CLD2::uint32 utf8PropObj_hiadd; michael@0: CLD2::uint32 startOf_utf8PropObj_state_table; michael@0: CLD2::uint32 lengthOf_utf8PropObj_state_table; michael@0: CLD2::uint32 startOf_utf8PropObj_remap_base; michael@0: CLD2::uint32 lengthOf_utf8PropObj_remap_base; michael@0: CLD2::uint32 startOf_utf8PropObj_remap_string; michael@0: CLD2::uint32 lengthOf_utf8PropObj_remap_string; michael@0: CLD2::uint32 startOf_utf8PropObj_fast_state; michael@0: CLD2::uint32 lengthOf_utf8PropObj_fast_state; michael@0: michael@0: // Average delta-octa-score bits michael@0: CLD2::uint32 startOf_kAvgDeltaOctaScore; michael@0: CLD2::uint32 lengthOf_kAvgDeltaOctaScore; michael@0: michael@0: // Table bits michael@0: CLD2::uint32 numTablesEncoded; michael@0: TableHeader* tableHeaders; michael@0: } FileHeader; michael@0: michael@0: // Calculate the exact size of a header that encodes the specified number of michael@0: // tables. This can be used to reserve space within the data file, michael@0: // calculate offsets, and so on. michael@0: CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables); michael@0: michael@0: // Dump a given header to stdout as a human-readable string. michael@0: void dumpHeader(FileHeader* header); michael@0: michael@0: // Verify that a given pair of scoring tables match precisely michael@0: // If there is a problem, returns an error message; otherwise, the empty string. michael@0: bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData); michael@0: michael@0: // Return true iff the program is running in little-endian mode. michael@0: bool isLittleEndian(); michael@0: michael@0: // Return true iff the core size assumptions are ok on this platform. michael@0: bool coreAssumptionsOk(); michael@0: michael@0: } // End namespace CLD2DynamicData michael@0: #endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_