Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | // Copyright 2014 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | #ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ |
michael@0 | 16 | #define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ |
michael@0 | 17 | |
michael@0 | 18 | #include "integral_types.h" |
michael@0 | 19 | #include "cld2tablesummary.h" |
michael@0 | 20 | #include "utf8statetable.h" |
michael@0 | 21 | #include "scoreonescriptspan.h" |
michael@0 | 22 | |
michael@0 | 23 | /* |
michael@0 | 24 | There are two primary parts to a CLD2 dynamic data file: |
michael@0 | 25 | 1. A header, wherein trivial data, block lengths and block offsets are kept |
michael@0 | 26 | 2. A data block, wherein the large binary blocks are kept |
michael@0 | 27 | |
michael@0 | 28 | By reading the header, an application can determine the offsets and lengths of |
michael@0 | 29 | all the data blocks for all tables. Offsets in the header are expressed |
michael@0 | 30 | relative to the first byte of the file, inclusive of the header itself; thus, |
michael@0 | 31 | any offset whose value is less than the length of the header is invalid. |
michael@0 | 32 | |
michael@0 | 33 | Any offset whose value is zero indicates a field that is null in the |
michael@0 | 34 | underlying CLD2 data; a real example of this is the fast_state field of the |
michael@0 | 35 | UTF8PropObj, which may be null. |
michael@0 | 36 | |
michael@0 | 37 | The size of the header can be precalculated by calling calculateHeaderSize(), |
michael@0 | 38 | which will indicate the exact size of the header for a data file that contains |
michael@0 | 39 | a given number of CLD2TableSummary objects. |
michael@0 | 40 | |
michael@0 | 41 | Notes on endianness: |
michael@0 | 42 | The data format is only suitable for little-endian machines. For big-endian |
michael@0 | 43 | systems, a tedious transformation would need to be made first to reverse the |
michael@0 | 44 | byte order of significant portions of the binary - not just the lengths, but |
michael@0 | 45 | also some of the underlying table data. |
michael@0 | 46 | |
michael@0 | 47 | Note on 32/64 bit: |
michael@0 | 48 | The data format is agnostic to 32/64 bit pointers. All the offsets within the |
michael@0 | 49 | data blob itself are 32-bit values relative to the start of the file, and the |
michael@0 | 50 | file should certainly never be gigabytes in size! |
michael@0 | 51 | When the file is ultimately read by the loading code and mmap()'d, new |
michael@0 | 52 | pointers are generated at whatever size the system uses, initialized to the |
michael@0 | 53 | start of the mmap, and incremented by the 32-bit offset. This should be safe |
michael@0 | 54 | regardless of 32- or 64-bit architectures. |
michael@0 | 55 | |
michael@0 | 56 | -------------------------------------------------------------------- |
michael@0 | 57 | FIELD |
michael@0 | 58 | -------------------------------------------------------------------- |
michael@0 | 59 | DATA_FILE_MARKER (no null terminator) |
michael@0 | 60 | total file size (sanity check, uint32) |
michael@0 | 61 | -------------------------------------------------------------------- |
michael@0 | 62 | UTF8PropObj: const uint32 state0 |
michael@0 | 63 | UTF8PropObj: const uint32 state0_size |
michael@0 | 64 | UTF8PropObj: const uint32 total_size |
michael@0 | 65 | UTF8PropObj: const int max_expand (coerced to 32 bits) |
michael@0 | 66 | UTF8PropObj: const int entry_shift (coerced to 32 bits) |
michael@0 | 67 | UTF8PropObj: const int bytes_per_entry (coerced to 32 bits) |
michael@0 | 68 | UTF8PropObj: const uint32 losub |
michael@0 | 69 | UTF8PropObj: const uint32 hiadd |
michael@0 | 70 | offset of UTF8PropObj: const uint8* state_table |
michael@0 | 71 | length of UTF8PropObj: const uint8* state_table |
michael@0 | 72 | offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) |
michael@0 | 73 | length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) |
michael@0 | 74 | offset of UTF8PropObj: const uint8* remap_string |
michael@0 | 75 | length of UTF8PropObj: const uint8* remap_string |
michael@0 | 76 | offset of UTF8PropObj: const uint8* fast_state |
michael@0 | 77 | length of UTF8PropObj: const uint8* fast_state |
michael@0 | 78 | -------------------------------------------------------------------- |
michael@0 | 79 | start of const short kAvgDeltaOctaScore[] |
michael@0 | 80 | length of const short kAvgDeltaOctaScore[] |
michael@0 | 81 | -------------------------------------------------------------------- |
michael@0 | 82 | number of CLD2TableSummary objects encoded (n) |
michael@0 | 83 | [Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne |
michael@0 | 84 | [Table 1]: CLD2TableSummary: uint32 kCLDTableSize |
michael@0 | 85 | [Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask |
michael@0 | 86 | [Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate |
michael@0 | 87 | [Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
michael@0 | 88 | [Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
michael@0 | 89 | [Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd |
michael@0 | 90 | [Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd |
michael@0 | 91 | [Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts |
michael@0 | 92 | [Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 |
michael@0 | 93 | . |
michael@0 | 94 | . |
michael@0 | 95 | . |
michael@0 | 96 | [Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne |
michael@0 | 97 | [Table n]: CLD2TableSummary: uint32 kCLDTableSize |
michael@0 | 98 | [Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask |
michael@0 | 99 | [Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate |
michael@0 | 100 | [Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
michael@0 | 101 | [Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
michael@0 | 102 | [Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd |
michael@0 | 103 | [Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd |
michael@0 | 104 | [Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts |
michael@0 | 105 | [Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 |
michael@0 | 106 | -------------------------------------------------------------------- |
michael@0 | 107 | |
michael@0 | 108 | |
michael@0 | 109 | Immediately after the header fields comes the data block. The data block has |
michael@0 | 110 | the following content, in this order (note that padding is applied in order to |
michael@0 | 111 | keep lookups word-aligned): |
michael@0 | 112 | |
michael@0 | 113 | UTF8PropObj: const uint8* state_table |
michael@0 | 114 | UTF8PropObj: const RemapEntry* remap_base (4-byte struct) |
michael@0 | 115 | UTF8PropObj: const uint8* remap_string |
michael@0 | 116 | UTF8PropObj: const uint8* fast_state |
michael@0 | 117 | const short kAvgDeltaOctaScore[] |
michael@0 | 118 | [Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
michael@0 | 119 | [Table 1]: CLD2TableSummary: const uint32* kCLDTableInd |
michael@0 | 120 | [Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) |
michael@0 | 121 | . |
michael@0 | 122 | . |
michael@0 | 123 | . |
michael@0 | 124 | [Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
michael@0 | 125 | [Table n]: CLD2TableSummary: const uint32* kCLDTableInd |
michael@0 | 126 | [Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) |
michael@0 | 127 | |
michael@0 | 128 | |
michael@0 | 129 | It is STRONGLY recommended that the chunks within the data block be kept |
michael@0 | 130 | 128-bit aligned for efficiency reasons, although the code will work without |
michael@0 | 131 | such alignment: the main lookup tables have randomly-accessed groups of four |
michael@0 | 132 | 4-byte entries, and these should be 16-byte aligned to avoid the performance |
michael@0 | 133 | cost of multiple cache misses per group. |
michael@0 | 134 | */ |
michael@0 | 135 | namespace CLD2DynamicData { |
michael@0 | 136 | |
michael@0 | 137 | static const char* DATA_FILE_MARKER = "cld2_data_file00"; |
michael@0 | 138 | static const int DATA_FILE_MARKER_LENGTH = 16; // Marker length in bytes, excluding any null terminator; keep aligned to 128 bits |
michael@0 | 139 | |
michael@0 | 140 | // Nicer version of memcmp that shows the offset at which bytes differ; compares `length` bytes of data1 and data2 (NOTE(review): presumably returns true when they are equal - confirm) |
michael@0 | 141 | bool mem_compare(const void* data1, const void* data2, const int length); |
michael@0 | 142 | |
michael@0 | 143 | // Enable or disable debugging: pass debug=0 to disable, debug=1 to enable |
michael@0 | 144 | void setDebug(int debug); |
michael@0 | 145 | |
michael@0 | 146 | // Lower-level structure for individual tables: one TableHeader per encoded CLD2TableSummary, so a file header holds n of them. |
michael@0 | 147 | // The four kCLDTable* scalars are stored by value; each startOf_/lengthOf_ pair gives the offset (from the start of the file) and length of one data chunk. |
michael@0 | 148 | typedef struct { |
michael@0 | 149 | CLD2::uint32 kCLDTableSizeOne; |
michael@0 | 150 | CLD2::uint32 kCLDTableSize; |
michael@0 | 151 | CLD2::uint32 kCLDTableKeyMask; |
michael@0 | 152 | CLD2::uint32 kCLDTableBuildDate; |
michael@0 | 153 | CLD2::uint32 startOf_kCLDTable; |
michael@0 | 154 | CLD2::uint32 lengthOf_kCLDTable; |
michael@0 | 155 | CLD2::uint32 startOf_kCLDTableInd; |
michael@0 | 156 | CLD2::uint32 lengthOf_kCLDTableInd; |
michael@0 | 157 | CLD2::uint32 startOf_kRecognizedLangScripts; |
michael@0 | 158 | CLD2::uint32 lengthOf_kRecognizedLangScripts; |
michael@0 | 159 | } TableHeader; |
michael@0 | 160 | |
michael@0 | 161 | |
michael@0 | 162 | // Top-level structure for a CLD2 data file header, mirroring the field list documented at the top of this file. |
michael@0 | 163 | // Holds every primitive header field plus a pointer to the per-table headers; all startOf_* values are offsets from the first byte of the file. |
michael@0 | 164 | // NOTE(review): tableHeaders is a pointer, so sizeof(FileHeader) is presumably not the on-disk header size - use calculateHeaderSize() for that. |
michael@0 | 165 | typedef struct { |
michael@0 | 166 | // Marker fields help recognize and verify the data file: sanityString holds DATA_FILE_MARKER (no null terminator), totalFileSizeBytes is a sanity check |
michael@0 | 167 | char sanityString[DATA_FILE_MARKER_LENGTH]; |
michael@0 | 168 | CLD2::uint32 totalFileSizeBytes; |
michael@0 | 169 | |
michael@0 | 170 | // UTF8PropObj primitives (int fields are stored as 32-bit values), then offset/length pairs for its tables; a zero offset means the field is null |
michael@0 | 171 | CLD2::uint32 utf8PropObj_state0; |
michael@0 | 172 | CLD2::uint32 utf8PropObj_state0_size; |
michael@0 | 173 | CLD2::uint32 utf8PropObj_total_size; |
michael@0 | 174 | CLD2::uint32 utf8PropObj_max_expand; |
michael@0 | 175 | CLD2::uint32 utf8PropObj_entry_shift; |
michael@0 | 176 | CLD2::uint32 utf8PropObj_bytes_per_entry; |
michael@0 | 177 | CLD2::uint32 utf8PropObj_losub; |
michael@0 | 178 | CLD2::uint32 utf8PropObj_hiadd; |
michael@0 | 179 | CLD2::uint32 startOf_utf8PropObj_state_table; |
michael@0 | 180 | CLD2::uint32 lengthOf_utf8PropObj_state_table; |
michael@0 | 181 | CLD2::uint32 startOf_utf8PropObj_remap_base; |
michael@0 | 182 | CLD2::uint32 lengthOf_utf8PropObj_remap_base; |
michael@0 | 183 | CLD2::uint32 startOf_utf8PropObj_remap_string; |
michael@0 | 184 | CLD2::uint32 lengthOf_utf8PropObj_remap_string; |
michael@0 | 185 | CLD2::uint32 startOf_utf8PropObj_fast_state; |
michael@0 | 186 | CLD2::uint32 lengthOf_utf8PropObj_fast_state; |
michael@0 | 187 | |
michael@0 | 188 | // Offset and length of the kAvgDeltaOctaScore[] array (const short) |
michael@0 | 189 | CLD2::uint32 startOf_kAvgDeltaOctaScore; |
michael@0 | 190 | CLD2::uint32 lengthOf_kAvgDeltaOctaScore; |
michael@0 | 191 | |
michael@0 | 192 | // Table bits: numTablesEncoded is n; tableHeaders presumably points at n TableHeader entries (an in-memory pointer, not a file offset) - confirm |
michael@0 | 193 | CLD2::uint32 numTablesEncoded; |
michael@0 | 194 | TableHeader* tableHeaders; |
michael@0 | 195 | } FileHeader; |
michael@0 | 196 | |
michael@0 | 197 | // Calculate the exact size of a header that encodes the specified number of |
michael@0 | 198 | // tables (numTables CLD2TableSummary objects). This can be used to reserve space |
michael@0 | 199 | // for the header within the data file, calculate offsets, and so on. |
michael@0 | 200 | CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables); |
michael@0 | 201 | |
michael@0 | 202 | // Dump the given file header to stdout as human-readable text; nothing is returned. |
michael@0 | 203 | void dumpHeader(FileHeader* header); |
michael@0 | 204 | |
michael@0 | 205 | // Verify that a given pair of scoring tables match precisely. |
michael@0 | 206 | // NOTE(review): the declared return type is bool, not an error-message string; presumably true means the tables match - confirm in the implementation. |
michael@0 | 207 | bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData); |
michael@0 | 208 | |
michael@0 | 209 | // Return true iff the program is running in little-endian mode; the data file format is only suitable for little-endian machines. |
michael@0 | 210 | bool isLittleEndian(); |
michael@0 | 211 | |
michael@0 | 212 | // Return true iff the core size assumptions are ok on this platform. NOTE(review): presumably checks the sizes of the primitive types the format relies on - confirm in the implementation. |
michael@0 | 213 | bool coreAssumptionsOk(); |
michael@0 | 214 | |
michael@0 | 215 | } // End namespace CLD2DynamicData |
michael@0 | 216 | #endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ |