|
1 // Copyright 2014 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 #ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ |
|
16 #define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ |
|
17 |
|
18 #include "integral_types.h" |
|
19 #include "cld2tablesummary.h" |
|
20 #include "utf8statetable.h" |
|
21 #include "scoreonescriptspan.h" |
|
22 |
|
23 /* |
|
24 There are two primary parts to a CLD2 dynamic data file: |
|
25 1. A header, wherein trivial data, block lengths and block offsets are kept |
|
26 2. A data block, wherein the large binary blocks are kept |
|
27 |
|
28 By reading the header, an application can determine the offsets and lengths of |
|
29 all the data blocks for all tables. Offsets in the header are expressed |
|
30 relative to the first byte of the file, inclusive of the header itself; thus, |
|
31 any offset whose value is less than the length of the header is invalid. |
|
32 |
|
33 Any offset whose value is zero indicates a field that is null in the |
|
34 underlying CLD2 data; a real example of this is the fast_state field of the |
|
35 UTF8PropObj, which may be null. |
|
36 |
|
37 The size of the header can be precalculated by calling calculateHeaderSize(), |
|
38 which will indicate the exact size of the header for a data file that contains |
|
39 a given number of CLD2TableSummary objects. |
|
40 |
|
41 Notes on endianness: |
|
42 The data format is only suitable for little-endian machines. For big-endian |
|
43 systems, a tedious transformation would need to be made first to reverse the |
|
44 byte order of significant portions of the binary - not just the lengths, but |
|
45 also some of the underlying table data. |
|
46 |
|
47 Note on 32/64 bit: |
|
48 The data format is agnostic to 32/64 bit pointers. All the offsets within the |
|
49 data blob itself are 32-bit values relative to the start of the file, and the |
|
50 file should certainly never be gigabytes in size! |
|
51 When the file is ultimately read by the loading code and mmap()'d, new |
|
52 pointers are generated at whatever size the system uses, initialized to the |
|
53 start of the mmap, and incremented by the 32-bit offset. This should be safe |
|
54 regardless of 32- or 64-bit architectures. |
|
55 |
|
56 -------------------------------------------------------------------- |
|
57 FIELD |
|
58 -------------------------------------------------------------------- |
|
59 DATA_FILE_MARKER (no null terminator) |
|
60 total file size (sanity check, uint32) |
|
61 -------------------------------------------------------------------- |
|
62 UTF8PropObj: const uint32 state0 |
|
63 UTF8PropObj: const uint32 state0_size |
|
64 UTF8PropObj: const uint32 total_size |
|
65 UTF8PropObj: const int max_expand (coerced to 32 bits) |
|
66 UTF8PropObj: const int entry_shift (coerced to 32 bits) |
|
67 UTF8PropObj: const int bytes_per_entry (coerced to 32 bits) |
|
68 UTF8PropObj: const uint32 losub |
|
69 UTF8PropObj: const uint32 hiadd |
|
70 offset of UTF8PropObj: const uint8* state_table |
|
71 length of UTF8PropObj: const uint8* state_table |
|
72 offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) |
|
73 length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) |
|
74 offset of UTF8PropObj: const uint8* remap_string |
|
75 length of UTF8PropObj: const uint8* remap_string |
|
76 offset of UTF8PropObj: const uint8* fast_state |
|
77 length of UTF8PropObj: const uint8* fast_state |
|
78 -------------------------------------------------------------------- |
|
79 start of const short kAvgDeltaOctaScore[] |
|
80 length of const short kAvgDeltaOctaScore[] |
|
81 -------------------------------------------------------------------- |
|
82 number of CLD2TableSummary objects encoded (n) |
|
83 [Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne |
|
84 [Table 1]: CLD2TableSummary: uint32 kCLDTableSize |
|
85 [Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask |
|
86 [Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate |
|
87 [Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
|
88 [Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
|
89 [Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd |
|
90 [Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd |
|
91 [Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts |
|
92 [Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts, plus 1 for the null terminator |
|
93 . |
|
94 . |
|
95 . |
|
96 [Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne |
|
97 [Table n]: CLD2TableSummary: uint32 kCLDTableSize |
|
98 [Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask |
|
99 [Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate |
|
100 [Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
|
101 [Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
|
102 [Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd |
|
103 [Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd |
|
104 [Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts |
|
105 [Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts, plus 1 for the null terminator |
|
106 -------------------------------------------------------------------- |
|
107 |
|
108 |
|
109 Immediately after the header fields comes the data block. The data block has |
|
110 the following content, in this order (note that padding is applied in order to |
|
111 keep lookups word-aligned): |
|
112 |
|
113 UTF8PropObj: const uint8* state_table |
|
114 UTF8PropObj: const RemapEntry* remap_base (4-byte struct) |
|
115 UTF8PropObj: const uint8* remap_string |
|
116 UTF8PropObj: const uint8* fast_state |
|
117 const short kAvgDeltaOctaScore[] |
|
118 [Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
|
119 [Table 1]: CLD2TableSummary: const uint32* kCLDTableInd |
|
120 [Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) |
|
121 . |
|
122 . |
|
123 . |
|
124 [Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable |
|
125 [Table n]: CLD2TableSummary: const uint32* kCLDTableInd |
|
126 [Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) |
|
127 |
|
128 |
|
129 It is STRONGLY recommended that the chunks within the data block be kept |
|
130 128-bit aligned for efficiency reasons, although the code will work without |
|
131 such alignment: the main lookup tables have randomly-accessed groups of four |
|
132 4-byte entries, and these must be 16-byte aligned to avoid the performance |
|
133 cost of multiple cache misses per group. |
|
134 */ |
|
135 namespace CLD2DynamicData { |
|
136 |
|
// Marker string that identifies a CLD2 dynamic data file; it is the first
// field of the file header and is stored there without a null terminator.
// The pointer itself is const: this is a header-scope static, so every
// translation unit gets its own copy, and none of them should be able to
// re-point it at a different string.
static const char* const DATA_FILE_MARKER = "cld2_data_file00";
// Number of marker bytes stored in the header. Keep aligned to 128 bits.
static const int DATA_FILE_MARKER_LENGTH = 16;
|
139 |
|
// A friendlier alternative to memcmp: besides comparing the two regions, it
// shows the offset at which their bytes differ.
// NOTE(review): the meaning of the bool result (presumably true when the
// regions are identical) is set by the implementation - confirm there.
bool mem_compare(const void* data1, const void* data2, const int length);
|
142 |
|
// Switches debugging on or off: pass 1 to enable it, 0 to disable it.
void setDebug(int debug);
|
145 |
|
146 // Lower-level structure for individual tables. There are n table headers in |
|
147 // a given file header. |
|
148 typedef struct { |
|
149 CLD2::uint32 kCLDTableSizeOne; |
|
150 CLD2::uint32 kCLDTableSize; |
|
151 CLD2::uint32 kCLDTableKeyMask; |
|
152 CLD2::uint32 kCLDTableBuildDate; |
|
153 CLD2::uint32 startOf_kCLDTable; |
|
154 CLD2::uint32 lengthOf_kCLDTable; |
|
155 CLD2::uint32 startOf_kCLDTableInd; |
|
156 CLD2::uint32 lengthOf_kCLDTableInd; |
|
157 CLD2::uint32 startOf_kRecognizedLangScripts; |
|
158 CLD2::uint32 lengthOf_kRecognizedLangScripts; |
|
159 } TableHeader; |
|
160 |
|
161 |
|
162 // Top-level structure for a CLD2 Data File Header. |
|
163 // Contains all the primitive fields for the header as well as an array of |
|
164 // headers for the individual tables. |
|
165 typedef struct { |
|
166 // Marker fields help recognize and verify the data file |
|
167 char sanityString[DATA_FILE_MARKER_LENGTH]; |
|
168 CLD2::uint32 totalFileSizeBytes; |
|
169 |
|
170 // UTF8 primitives |
|
171 CLD2::uint32 utf8PropObj_state0; |
|
172 CLD2::uint32 utf8PropObj_state0_size; |
|
173 CLD2::uint32 utf8PropObj_total_size; |
|
174 CLD2::uint32 utf8PropObj_max_expand; |
|
175 CLD2::uint32 utf8PropObj_entry_shift; |
|
176 CLD2::uint32 utf8PropObj_bytes_per_entry; |
|
177 CLD2::uint32 utf8PropObj_losub; |
|
178 CLD2::uint32 utf8PropObj_hiadd; |
|
179 CLD2::uint32 startOf_utf8PropObj_state_table; |
|
180 CLD2::uint32 lengthOf_utf8PropObj_state_table; |
|
181 CLD2::uint32 startOf_utf8PropObj_remap_base; |
|
182 CLD2::uint32 lengthOf_utf8PropObj_remap_base; |
|
183 CLD2::uint32 startOf_utf8PropObj_remap_string; |
|
184 CLD2::uint32 lengthOf_utf8PropObj_remap_string; |
|
185 CLD2::uint32 startOf_utf8PropObj_fast_state; |
|
186 CLD2::uint32 lengthOf_utf8PropObj_fast_state; |
|
187 |
|
188 // Average delta-octa-score bits |
|
189 CLD2::uint32 startOf_kAvgDeltaOctaScore; |
|
190 CLD2::uint32 lengthOf_kAvgDeltaOctaScore; |
|
191 |
|
192 // Table bits |
|
193 CLD2::uint32 numTablesEncoded; |
|
194 TableHeader* tableHeaders; |
|
195 } FileHeader; |
|
196 |
|
197 // Calculate the exact size of a header that encodes the specified number of |
|
198 // tables. This can be used to reserve space within the data file, |
|
199 // calculate offsets, and so on. |
|
200 CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables); |
|
201 |
|
202 // Dump a given header to stdout as a human-readable string. |
|
203 void dumpHeader(FileHeader* header); |
|
204 |
|
205 // Verify that a given pair of scoring tables match precisely |
|
206 // If there is a problem, returns an error message; otherwise, the empty string. |
|
207 bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData); |
|
208 |
|
// True if and only if the program is running in little-endian mode.
bool isLittleEndian();
|
211 |
|
// True if and only if the core size assumptions hold on this platform.
bool coreAssumptionsOk();
|
214 |
|
215 } // End namespace CLD2DynamicData |
|
216 #endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ |