browser/components/translation/cld2/internal/cld2_dynamic_data.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 // Copyright 2014 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 #ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
michael@0 16 #define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
michael@0 17
michael@0 18 #include "integral_types.h"
michael@0 19 #include "cld2tablesummary.h"
michael@0 20 #include "utf8statetable.h"
michael@0 21 #include "scoreonescriptspan.h"
michael@0 22
michael@0 23 /*
michael@0 24 There are two primary parts to a CLD2 dynamic data file:
michael@0 25 1. A header, wherein trivial data, block lengths and block offsets are kept
michael@0 26 2. A data block, wherein the large binary blocks are kept
michael@0 27
michael@0 28 By reading the header, an application can determine the offsets and lengths of
michael@0 29 all the data blocks for all tables. Offsets in the header are expressed
michael@0 30 relative to the first byte of the file, inclusive of the header itself; thus,
michael@0 31 any offset whose value is less than the length of the header is invalid.
michael@0 32
michael@0 33 Any offset whose value is zero indicates a field that is null in the
michael@0 34 underlying CLD2 data; a real example of this is the fast_state field of the
michael@0 35 UTF8PropObj, which may be null.
michael@0 36
michael@0 37 The size of the header can be precalculated by calling calculateHeaderSize(),
michael@0 38 which will indicate the exact size of the header for a data file that contains
michael@0 39 a given number of CLD2TableSummary objects.
michael@0 40
michael@0 41 Notes on endianness:
michael@0 42 The data format is only suitable for little-endian machines. For big-endian
michael@0 43 systems, a tedious transformation would need to be made first to reverse the
michael@0 44 byte order of significant portions of the binary - not just the lengths, but
michael@0 45 also some of the underlying table data.
michael@0 46
michael@0 47 Note on 32/64 bit:
michael@0 48 The data format is agnostic to 32/64 bit pointers. All the offsets within the
michael@0 49 data blob itself are 32-bit values relative to the start of the file, and the
michael@0 50 file should certainly never be gigabytes in size!
michael@0 51 When the file is ultimately read by the loading code and mmap()'d, new
michael@0 52 pointers are generated at whatever size the system uses, initialized to the
michael@0 53 start of the mmap, and incremented by the 32-bit offset. This should be safe
michael@0 54 regardless of 32- or 64-bit architectures.
michael@0 55
michael@0 56 --------------------------------------------------------------------
michael@0 57 FIELD
michael@0 58 --------------------------------------------------------------------
michael@0 59 DATA_FILE_MARKER (no null terminator)
michael@0 60 total file size (sanity check, uint32)
michael@0 61 --------------------------------------------------------------------
michael@0 62 UTF8PropObj: const uint32 state0
michael@0 63 UTF8PropObj: const uint32 state0_size
michael@0 64 UTF8PropObj: const uint32 total_size
michael@0 65 UTF8PropObj: const int max_expand (coerced to 32 bits)
michael@0 66 UTF8PropObj: const int entry_shift (coerced to 32 bits)
michael@0 67 UTF8PropObj: const int bytes_per_entry (coerced to 32 bits)
michael@0 68 UTF8PropObj: const uint32 losub
michael@0 69 UTF8PropObj: const uint32 hiadd
michael@0 70 offset of UTF8PropObj: const uint8* state_table
michael@0 71 length of UTF8PropObj: const uint8* state_table
michael@0 72 offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
michael@0 73 length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
michael@0 74 offset of UTF8PropObj: const uint8* remap_string
michael@0 75 length of UTF8PropObj: const uint8* remap_string
michael@0 76 offset of UTF8PropObj: const uint8* fast_state
michael@0 77 length of UTF8PropObj: const uint8* fast_state
michael@0 78 --------------------------------------------------------------------
michael@0 79 offset of const short kAvgDeltaOctaScore[]
michael@0 80 length of const short kAvgDeltaOctaScore[]
michael@0 81 --------------------------------------------------------------------
michael@0 82 number of CLD2TableSummary objects encoded (n)
michael@0 83 [Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne
michael@0 84 [Table 1]: CLD2TableSummary: uint32 kCLDTableSize
michael@0 85 [Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask
michael@0 86 [Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate
michael@0 87 [Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
michael@0 88 [Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
michael@0 89 [Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd
michael@0 90 [Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd
michael@0 91 [Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
michael@0 92 [Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1
michael@0 93 .
michael@0 94 .
michael@0 95 .
michael@0 96 [Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne
michael@0 97 [Table n]: CLD2TableSummary: uint32 kCLDTableSize
michael@0 98 [Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask
michael@0 99 [Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate
michael@0 100 [Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
michael@0 101 [Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
michael@0 102 [Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd
michael@0 103 [Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd
michael@0 104 [Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
michael@0 105 [Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1
michael@0 106 --------------------------------------------------------------------
michael@0 107
michael@0 108
michael@0 109 Immediately after the header fields comes the data block. The data block has
michael@0 110 the following content, in this order (note that padding is applied in order to
michael@0 111 keep lookups word-aligned):
michael@0 112
michael@0 113 UTF8PropObj: const uint8* state_table
michael@0 114 UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
michael@0 115 UTF8PropObj: const uint8* remap_string
michael@0 116 UTF8PropObj: const uint8* fast_state
michael@0 117 const short kAvgDeltaOctaScore[]
michael@0 118 [Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
michael@0 119 [Table 1]: CLD2TableSummary: const uint32* kCLDTableInd
michael@0 120 [Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
michael@0 121 .
michael@0 122 .
michael@0 123 .
michael@0 124 [Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
michael@0 125 [Table n]: CLD2TableSummary: const uint32* kCLDTableInd
michael@0 126 [Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
michael@0 127
michael@0 128
michael@0 129 It is STRONGLY recommended that the chunks within the data block be kept
michael@0 130 128-bit aligned for efficiency reasons, although the code will work without
michael@0 131 such alignment: the main lookup tables have randomly-accessed groups of four
michael@0 132 4-byte entries, and these must be 16-byte aligned to avoid the performance
michael@0 133 cost of multiple cache misses per group.
michael@0 134 */
michael@0 135 namespace CLD2DynamicData {
michael@0 136
michael@0 137 static const char* DATA_FILE_MARKER = "cld2_data_file00"; // Marker written as the first header field, without a null terminator. NOTE(review): the pointer itself is not const and every including file gets its own copy; consider `const char* const`.
michael@0 138 static const int DATA_FILE_MARKER_LENGTH = 16; // Number of characters in DATA_FILE_MARKER and size of FileHeader::sanityString; keep aligned to 128 bits
michael@0 139
michael@0 140 // Nicer version of memcmp that shows the offset at which bytes differ. NOTE(review): returns bool rather than memcmp's int; presumably true means the first `length` bytes are equal -- confirm against the definition.
michael@0 141 bool mem_compare(const void* data1, const void* data2, const int length);
michael@0 142
michael@0 143 // Enable or disable debugging; pass 0 to disable, 1 to enable. NOTE(review): the effect of other values is not specified here.
michael@0 144 void setDebug(int debug);
michael@0 145
michael@0 146 // Lower-level structure describing one individual table. There are n table headers in
michael@0 147 // a given file header; the fields follow the per-table order of the header layout documented above.
michael@0 148 typedef struct {
michael@0 149 CLD2::uint32 kCLDTableSizeOne; // CLD2TableSummary: kCLDTableSizeOne
michael@0 150 CLD2::uint32 kCLDTableSize; // CLD2TableSummary: kCLDTableSize
michael@0 151 CLD2::uint32 kCLDTableKeyMask; // CLD2TableSummary: kCLDTableKeyMask
michael@0 152 CLD2::uint32 kCLDTableBuildDate; // CLD2TableSummary: kCLDTableBuildDate
michael@0 153 CLD2::uint32 startOf_kCLDTable; // Offset of the kCLDTable data, relative to the first byte of the file
michael@0 154 CLD2::uint32 lengthOf_kCLDTable; // Length of the kCLDTable data
michael@0 155 CLD2::uint32 startOf_kCLDTableInd; // Offset of the kCLDTableInd data, relative to the first byte of the file
michael@0 156 CLD2::uint32 lengthOf_kCLDTableInd; // Length of the kCLDTableInd data
michael@0 157 CLD2::uint32 startOf_kRecognizedLangScripts; // Offset of the kRecognizedLangScripts string, relative to the first byte of the file
michael@0 158 CLD2::uint32 lengthOf_kRecognizedLangScripts; // Length of kRecognizedLangScripts + 1 (the string is stored with its null terminator)
michael@0 159 } TableHeader;
michael@0 160
michael@0 161
michael@0 162 // Top-level structure for a CLD2 Data File Header.
michael@0 163 // Contains all the primitive fields for the header, in the order of the layout documented above,
michael@0 164 // as well as an array of headers for the individual tables. An offset of zero marks a field that is null in the underlying CLD2 data.
michael@0 165 typedef struct {
michael@0 166 // Marker fields help recognize and verify the data file
michael@0 167 char sanityString[DATA_FILE_MARKER_LENGTH]; // DATA_FILE_MARKER bytes, no null terminator
michael@0 168 CLD2::uint32 totalFileSizeBytes; // Total file size, kept as a sanity check
michael@0 169
michael@0 170 // UTF8 primitives (scalar members of UTF8PropObj, followed by offset/length pairs for its tables)
michael@0 171 CLD2::uint32 utf8PropObj_state0;
michael@0 172 CLD2::uint32 utf8PropObj_state0_size;
michael@0 173 CLD2::uint32 utf8PropObj_total_size;
michael@0 174 CLD2::uint32 utf8PropObj_max_expand; // const int in UTF8PropObj, held here as uint32
michael@0 175 CLD2::uint32 utf8PropObj_entry_shift; // const int in UTF8PropObj, coerced to 32 bits
michael@0 176 CLD2::uint32 utf8PropObj_bytes_per_entry; // const int in UTF8PropObj, coerced to 32 bits
michael@0 177 CLD2::uint32 utf8PropObj_losub;
michael@0 178 CLD2::uint32 utf8PropObj_hiadd;
michael@0 179 CLD2::uint32 startOf_utf8PropObj_state_table; // Offsets are relative to the first byte of the file
michael@0 180 CLD2::uint32 lengthOf_utf8PropObj_state_table;
michael@0 181 CLD2::uint32 startOf_utf8PropObj_remap_base; // remap_base points at RemapEntry, a 4-byte struct
michael@0 182 CLD2::uint32 lengthOf_utf8PropObj_remap_base;
michael@0 183 CLD2::uint32 startOf_utf8PropObj_remap_string;
michael@0 184 CLD2::uint32 lengthOf_utf8PropObj_remap_string;
michael@0 185 CLD2::uint32 startOf_utf8PropObj_fast_state; // fast_state may be null, in which case this offset is zero
michael@0 186 CLD2::uint32 lengthOf_utf8PropObj_fast_state;
michael@0 187
michael@0 188 // Average delta-octa-score bits (const short kAvgDeltaOctaScore[])
michael@0 189 CLD2::uint32 startOf_kAvgDeltaOctaScore;
michael@0 190 CLD2::uint32 lengthOf_kAvgDeltaOctaScore;
michael@0 191
michael@0 192 // Table bits
michael@0 193 CLD2::uint32 numTablesEncoded; // Number of CLD2TableSummary objects encoded (n)
michael@0 194 TableHeader* tableHeaders; // Headers for the individual tables. NOTE(review): a native pointer's size differs between 32- and 64-bit builds, so this struct is presumably not a byte-for-byte image of the on-disk header -- confirm with the loading code.
michael@0 195 } FileHeader;
michael@0 196
michael@0 197 // Calculate the exact size of a header that encodes the specified number of
michael@0 198 // tables (numTables CLD2TableSummary objects). This can be used to reserve space within the data file,
michael@0 199 // calculate offsets, and so on.
michael@0 200 CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables);
michael@0 201
michael@0 202 // Dump a given header to stdout as a human-readable string.
michael@0 203 void dumpHeader(FileHeader* header);
michael@0 204
michael@0 205 // Verify that a given pair of scoring tables match precisely.
michael@0 206 // Returns a bool, not an error message. NOTE(review): presumably true when the tables match and false otherwise -- confirm against the definition.
michael@0 207 bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData);
michael@0 208
michael@0 209 // Return true iff the program is running in little-endian mode (the only byte order the data format is suitable for).
michael@0 210 bool isLittleEndian();
michael@0 211
michael@0 212 // Return true iff the core size assumptions are ok on this platform.
michael@0 213 bool coreAssumptionsOk();
michael@0 214
michael@0 215 } // End namespace CLD2DynamicData
michael@0 216 #endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_

mercurial