1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/cld2_dynamic_data.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,216 @@ 1.4 +// Copyright 2014 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ 1.19 +#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ 1.20 + 1.21 +#include "integral_types.h" 1.22 +#include "cld2tablesummary.h" 1.23 +#include "utf8statetable.h" 1.24 +#include "scoreonescriptspan.h" 1.25 + 1.26 +/* 1.27 + There are two primary parts to a CLD2 dynamic data file: 1.28 + 1. A header, wherein trivial data, block lengths and block offsets are kept 1.29 + 2. A data block, wherein the large binary blocks are kept 1.30 + 1.31 + By reading the header, an application can determine the offsets and lengths of 1.32 + all the data blocks for all tables. Offsets in the header are expressed 1.33 + relative to the first byte of the file, inclusive of the header itself; thus, 1.34 + any offset whose value is less than the length of the header is invalid. 1.35 + 1.36 + Any offset whose value is zero indicates a field that is null in the 1.37 + underlying CLD2 data; a real example of this is the fast_state field of the 1.38 + UTF8PropObj, which may be null. 1.39 + 1.40 + The size of the header can be precalculated by calling calculateHeaderSize(), 1.41 + which will indicate the exact size of the header for a data file that contains 1.42 + a given number of CLD2TableSummary objects. 1.43 + 1.44 + Notes on endianness: 1.45 + The data format is only suitable for little-endian machines. For big-endian 1.46 + systems, a tedious transformation would need to be made first to reverse the 1.47 + byte order of significant portions of the binary - not just the lengths, but 1.48 + also some of the underlying table data. 1.49 + 1.50 + Note on 32/64 bit: 1.51 + The data format is agnostic to 32/64 bit pointers. All the offsets within the 1.52 + data blob itself are 32-bit values relative to the start of the file, and the 1.53 + file should certainly never be gigabytes in size! 1.54 + When the file is ultimately read by the loading code and mmap()'d, new 1.55 + pointers are generated at whatever size the system uses, initialized to the 1.56 + start of the mmap, and incremented by the 32-bit offset. This should be safe 1.57 + regardless of 32- or 64-bit architectures. 1.58 + 1.59 + -------------------------------------------------------------------- 1.60 + FIELD 1.61 + -------------------------------------------------------------------- 1.62 + DATA_FILE_MARKER (no null terminator) 1.63 + total file size (sanity check, uint32) 1.64 + -------------------------------------------------------------------- 1.65 + UTF8PropObj: const uint32 state0 1.66 + UTF8PropObj: const uint32 state0_size 1.67 + UTF8PropObj: const uint32 total_size 1.68 + UTF8PropObj: const int max_expand 1.69 + UTF8PropObj: const int entry_shift (coerced to 32 bits) 1.70 + UTF8PropObj: const int bytes_per_entry (coerced to 32 bits) 1.71 + UTF8PropObj: const uint32 losub 1.72 + UTF8PropObj: const uint32 hiadd 1.73 + offset of UTF8PropObj: const uint8* state_table 1.74 + length of UTF8PropObj: const uint8* state_table 1.75 + offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) 1.76 + length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) 1.77 + offset of UTF8PropObj: const uint8* remap_string 1.78 + length of UTF8PropObj: const uint8* remap_string 1.79 + offset of UTF8PropObj: const uint8* fast_state 1.80 + length of UTF8PropObj: const uint8* fast_state 1.81 + -------------------------------------------------------------------- 1.82 + start of const short kAvgDeltaOctaScore[] 1.83 + length of const short kAvgDeltaOctaScore[] 1.84 + -------------------------------------------------------------------- 1.85 + number of CLD2TableSummary objects encoded (n) 1.86 + [Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne 1.87 + [Table 1]: CLD2TableSummary: uint32 kCLDTableSize 1.88 + [Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask 1.89 + [Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate 1.90 + [Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable 1.91 + [Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable 1.92 + [Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd 1.93 + [Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd 1.94 + [Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts 1.95 + [Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 1.96 + . 1.97 + . 1.98 + . 1.99 + [Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne 1.100 + [Table n]: CLD2TableSummary: uint32 kCLDTableSize 1.101 + [Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask 1.102 + [Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate 1.103 + [Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable 1.104 + [Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable 1.105 + [Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd 1.106 + [Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd 1.107 + [Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts 1.108 + [Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 1.109 + -------------------------------------------------------------------- 1.110 + 1.111 + 1.112 + Immediately after the header fields comes the data block. The data block has 1.113 + the following content, in this order (note that padding is applied in order to 1.114 + keep lookups word-aligned): 1.115 + 1.116 + UTF8PropObj: const uint8* state_table 1.117 + UTF8PropObj: const RemapEntry* remap_base (4-byte struct) 1.118 + UTF8PropObj: const uint8* remap_string 1.119 + UTF8PropObj: const uint8* fast_state 1.120 + const short kAvgDeltaOctaScore[] 1.121 + [Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable 1.122 + [Table 1]: CLD2TableSummary: const uint32* kCLDTableInd 1.123 + [Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) 1.124 + . 1.125 + . 1.126 + . 1.127 + [Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable 1.128 + [Table n]: CLD2TableSummary: const uint32* kCLDTableInd 1.129 + [Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) 1.130 + 1.131 + 1.132 + It is STRONGLY recommended that the chunks within the data block be kept 1.133 + 128-bit aligned for efficiency reasons, although the code will work without 1.134 + such alignment: the main lookup tables have randomly-accessed groups of four 1.135 + 4-byte entries, and these must be 16-byte aligned to avoid the performance 1.136 + cost of multiple cache misses per group. 1.137 +*/ 1.138 +namespace CLD2DynamicData { 1.139 + 1.140 +static const char* DATA_FILE_MARKER = "cld2_data_file00"; 1.141 +static const int DATA_FILE_MARKER_LENGTH = 16; // Keep aligned to 128 bits 1.142 + 1.143 +// Nicer version of memcmp that shows the offset at which bytes differ 1.144 +bool mem_compare(const void* data1, const void* data2, const int length); 1.145 + 1.146 +// Enable or disable debugging; 0 to disable, 1 to enable 1.147 +void setDebug(int debug); 1.148 + 1.149 +// Lower-level structure for individual tables. There are n table headers in 1.150 +// a given file header. 1.151 +typedef struct { 1.152 + CLD2::uint32 kCLDTableSizeOne; 1.153 + CLD2::uint32 kCLDTableSize; 1.154 + CLD2::uint32 kCLDTableKeyMask; 1.155 + CLD2::uint32 kCLDTableBuildDate; 1.156 + CLD2::uint32 startOf_kCLDTable; 1.157 + CLD2::uint32 lengthOf_kCLDTable; 1.158 + CLD2::uint32 startOf_kCLDTableInd; 1.159 + CLD2::uint32 lengthOf_kCLDTableInd; 1.160 + CLD2::uint32 startOf_kRecognizedLangScripts; 1.161 + CLD2::uint32 lengthOf_kRecognizedLangScripts; 1.162 +} TableHeader; 1.163 + 1.164 + 1.165 +// Top-level structure for a CLD2 Data File Header. 1.166 +// Contains all the primitive fields for the header as well as an array of 1.167 +// headers for the individual tables. 1.168 +typedef struct { 1.169 + // Marker fields help recognize and verify the data file 1.170 + char sanityString[DATA_FILE_MARKER_LENGTH]; 1.171 + CLD2::uint32 totalFileSizeBytes; 1.172 + 1.173 + // UTF8 primitives 1.174 + CLD2::uint32 utf8PropObj_state0; 1.175 + CLD2::uint32 utf8PropObj_state0_size; 1.176 + CLD2::uint32 utf8PropObj_total_size; 1.177 + CLD2::uint32 utf8PropObj_max_expand; 1.178 + CLD2::uint32 utf8PropObj_entry_shift; 1.179 + CLD2::uint32 utf8PropObj_bytes_per_entry; 1.180 + CLD2::uint32 utf8PropObj_losub; 1.181 + CLD2::uint32 utf8PropObj_hiadd; 1.182 + CLD2::uint32 startOf_utf8PropObj_state_table; 1.183 + CLD2::uint32 lengthOf_utf8PropObj_state_table; 1.184 + CLD2::uint32 startOf_utf8PropObj_remap_base; 1.185 + CLD2::uint32 lengthOf_utf8PropObj_remap_base; 1.186 + CLD2::uint32 startOf_utf8PropObj_remap_string; 1.187 + CLD2::uint32 lengthOf_utf8PropObj_remap_string; 1.188 + CLD2::uint32 startOf_utf8PropObj_fast_state; 1.189 + CLD2::uint32 lengthOf_utf8PropObj_fast_state; 1.190 + 1.191 + // Average delta-octa-score bits 1.192 + CLD2::uint32 startOf_kAvgDeltaOctaScore; 1.193 + CLD2::uint32 lengthOf_kAvgDeltaOctaScore; 1.194 + 1.195 + // Table bits 1.196 + CLD2::uint32 numTablesEncoded; 1.197 + TableHeader* tableHeaders; 1.198 +} FileHeader; 1.199 + 1.200 +// Calculate the exact size of a header that encodes the specified number of 1.201 +// tables. This can be used to reserve space within the data file, 1.202 +// calculate offsets, and so on. 1.203 +CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables); 1.204 + 1.205 +// Dump a given header to stdout as a human-readable string. 1.206 +void dumpHeader(FileHeader* header); 1.207 + 1.208 +// Verify that a given pair of scoring tables match precisely 1.209 +// If there is a problem, returns an error message; otherwise, the empty string. 1.210 +bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData); 1.211 + 1.212 +// Return true iff the program is running in little-endian mode. 1.213 +bool isLittleEndian(); 1.214 + 1.215 +// Return true iff the core size assumptions are ok on this platform. 1.216 +bool coreAssumptionsOk(); 1.217 + 1.218 +} // End namespace CLD2DynamicData 1.219 +#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_