browser/components/translation/cld2/internal/cld2_dynamic_data.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/cld2_dynamic_data.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,216 @@
     1.4 +// Copyright 2014 Google Inc. All Rights Reserved.                                                  
     1.5 +//                                                                                                  
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");                                  
     1.7 +// you may not use this file except in compliance with the License.                                 
     1.8 +// You may obtain a copy of the License at                                                          
     1.9 +//                                                                                                  
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0                                                   
    1.11 +//                                                                                                  
    1.12 +// Unless required by applicable law or agreed to in writing, software                              
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,                                
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.                         
    1.15 +// See the License for the specific language governing permissions and                              
    1.16 +// limitations under the License.
    1.17 +
    1.18 +#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
    1.19 +#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
    1.20 +
    1.21 +#include "integral_types.h"
    1.22 +#include "cld2tablesummary.h"
    1.23 +#include "utf8statetable.h"
    1.24 +#include "scoreonescriptspan.h"
    1.25 +
    1.26 +/*
    1.27 +  There are two primary parts to a CLD2 dynamic data file:
    1.28 +    1. A header, wherein trivial data, block lengths and block offsets are kept
    1.29 +    2. A data block, wherein the large binary blocks are kept
    1.30 +
    1.31 +  By reading the header, an application can determine the offsets and lengths of
    1.32 +  all the data blocks for all tables. Offsets in the header are expressed
    1.33 +  relative to the first byte of the file, inclusive of the header itself; thus,
    1.34 +  any nonzero offset whose value is less than the header length is invalid.
    1.35 +
    1.36 +  Any offset whose value is zero indicates a field that is null in the
    1.37 +  underlying CLD2 data; a real example of this is the fast_state field of the
    1.38 +  UTF8PropObj, which may be null.
    1.39 +
    1.40 +  The size of the header can be precalculated by calling calculateHeaderSize(),
    1.41 +  which will indicate the exact size of the header for a data file that contains
    1.42 +  a given number of CLD2TableSummary objects.
    1.43 +
    1.44 +  Notes on endianness:
    1.45 +  The data format is only suitable for little-endian machines. For big-endian
    1.46 +  systems, a tedious transformation would need to be made first to reverse the
    1.47 +  byte order of significant portions of the binary - not just the lengths, but
    1.48 +  also some of the underlying table data.
    1.49 +
    1.50 +  Note on 32/64 bit:
    1.51 +  The data format is agnostic to 32/64 bit pointers. All the offsets within the 
    1.52 +  data blob itself are 32-bit values relative to the start of the file, and the
    1.53 +  file should certainly never be gigabytes in size!
    1.54 +  When the file is ultimately read by the loading code and mmap()'d, new
    1.55 +  pointers are generated at whatever size the system uses, initialized to the
    1.56 +  start of the mmap, and incremented by the 32-bit offset. This should be safe
    1.57 +  regardless of 32- or 64-bit architectures.
    1.58 +
    1.59 +  --------------------------------------------------------------------
    1.60 +  FIELD
    1.61 +  --------------------------------------------------------------------
    1.62 +  DATA_FILE_MARKER (no null terminator)
    1.63 +  total file size (sanity check, uint32)
    1.64 +  --------------------------------------------------------------------
    1.65 +  UTF8PropObj: const uint32 state0
    1.66 +  UTF8PropObj: const uint32 state0_size
    1.67 +  UTF8PropObj: const uint32 total_size
    1.68 +  UTF8PropObj: const int max_expand (coerced to 32 bits)
    1.69 +  UTF8PropObj: const int entry_shift (coerced to 32 bits)
    1.70 +  UTF8PropObj: const int bytes_per_entry (coerced to 32 bits)
    1.71 +  UTF8PropObj: const uint32 losub
    1.72 +  UTF8PropObj: const uint32 hiadd
    1.73 +  offset of UTF8PropObj: const uint8* state_table
    1.74 +  length of UTF8PropObj: const uint8* state_table
    1.75 +  offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
    1.76 +  length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
    1.77 +  offset of UTF8PropObj: const uint8* remap_string
    1.78 +  length of UTF8PropObj: const uint8* remap_string
    1.79 +  offset of UTF8PropObj: const uint8* fast_state
    1.80 +  length of UTF8PropObj: const uint8* fast_state
    1.81 +  --------------------------------------------------------------------
    1.82 +  offset of const short kAvgDeltaOctaScore[]
    1.83 +  length of const short kAvgDeltaOctaScore[]
    1.84 +  --------------------------------------------------------------------
    1.85 +  number of CLD2TableSummary objects encoded (n)
    1.86 +  [Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne
    1.87 +  [Table 1]: CLD2TableSummary: uint32 kCLDTableSize
    1.88 +  [Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask
    1.89 +  [Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate
    1.90 +  [Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
    1.91 +  [Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
    1.92 +  [Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd
    1.93 +  [Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd
    1.94 +  [Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
    1.95 +  [Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1
    1.96 +  .
    1.97 +  .
    1.98 +  .
    1.99 +  [Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne
   1.100 +  [Table n]: CLD2TableSummary: uint32 kCLDTableSize
   1.101 +  [Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask
   1.102 +  [Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate
   1.103 +  [Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   1.104 +  [Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   1.105 +  [Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd
   1.106 +  [Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd
   1.107 +  [Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
   1.108 +  [Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1
   1.109 +  --------------------------------------------------------------------
   1.110 +
   1.111 +
   1.112 +  Immediately after the header fields comes the data block. The data block has
   1.113 +  the following content, in this order (note that padding is applied in order to
   1.114 +  keep lookups word-aligned):
   1.115 +
   1.116 +  UTF8PropObj: const uint8* state_table
   1.117 +  UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
   1.118 +  UTF8PropObj: const uint8* remap_string
   1.119 +  UTF8PropObj: const uint8* fast_state
   1.120 +  const short kAvgDeltaOctaScore[]
   1.121 +  [Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   1.122 +  [Table 1]: CLD2TableSummary: const uint32* kCLDTableInd
   1.123 +  [Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
   1.124 +  .
   1.125 +  .
   1.126 +  .
   1.127 +  [Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   1.128 +  [Table n]: CLD2TableSummary: const uint32* kCLDTableInd
   1.129 +  [Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
   1.130 +
   1.131 +
   1.132 +  It is STRONGLY recommended that the chunks within the data block be kept
   1.133 +  128-bit aligned for efficiency reasons, although the code will work without
   1.134 +  such alignment: the main lookup tables have randomly-accessed groups of four
   1.135 +  4-byte entries, and these must be 16-byte aligned to avoid the performance
   1.136 +  cost of multiple cache misses per group.
   1.137 +*/
   1.138 +namespace CLD2DynamicData {
   1.139 +
   1.140 +static const char* const DATA_FILE_MARKER = "cld2_data_file00"; // 16 chars; stored in the file without a null terminator. Pointer is const so no includer can reseat it.
   1.141 +static const int DATA_FILE_MARKER_LENGTH = 16; // Length of DATA_FILE_MARKER in bytes. Keep aligned to 128 bits
   1.142 +
   1.143 +// Nicer version of memcmp that shows the offset at which bytes differ.
   1.144 +bool mem_compare(const void* data1, const void* data2, const int length); // NOTE(review): presumably compares `length` bytes; which bool value means "equal" is not stated here - confirm in the implementation
   1.145 +
   1.146 +// Enable or disable debugging: pass 0 to disable, 1 to enable.
   1.147 +void setDebug(int debug); // NOTE(review): behavior for values other than 0 and 1 is not documented here
   1.148 +
   1.149 +// Lower-level structure for individual tables. There are n table headers in
   1.150 +// a given file header, one per encoded CLD2TableSummary. Offsets are relative to the first byte of the file; 0 means null.
   1.151 +struct TableHeader {
   1.152 +  CLD2::uint32 kCLDTableSizeOne;                // Value of the CLD2TableSummary field of the same name
   1.153 +  CLD2::uint32 kCLDTableSize;                   // Value of the CLD2TableSummary field of the same name
   1.154 +  CLD2::uint32 kCLDTableKeyMask;                // Value of the CLD2TableSummary field of the same name
   1.155 +  CLD2::uint32 kCLDTableBuildDate;              // Value of the CLD2TableSummary field of the same name
   1.156 +  CLD2::uint32 startOf_kCLDTable;               // Offset of the IndirectProbBucket4 table (kCLDTable)
   1.157 +  CLD2::uint32 lengthOf_kCLDTable;              // Length of the kCLDTable block
   1.158 +  CLD2::uint32 startOf_kCLDTableInd;            // Offset of the uint32 table (kCLDTableInd)
   1.159 +  CLD2::uint32 lengthOf_kCLDTableInd;           // Length of the kCLDTableInd block
   1.160 +  CLD2::uint32 startOf_kRecognizedLangScripts;  // Offset of the kRecognizedLangScripts string
   1.161 +  CLD2::uint32 lengthOf_kRecognizedLangScripts; // String length + 1, i.e. including the null terminator
   1.162 +};
   1.163 +
   1.164 +
   1.165 +// Top-level structure for a CLD2 Data File Header.
   1.166 +// Holds every primitive header field plus the array of per-table headers.
   1.167 +// Each startOf_* value is an offset from the first byte of the file; 0 marks a field that is null in the CLD2 data.
   1.168 +struct FileHeader {
   1.169 +  // Marker fields help recognize and verify the data file
   1.170 +  char sanityString[DATA_FILE_MARKER_LENGTH]; // Holds DATA_FILE_MARKER, without a null terminator
   1.171 +  CLD2::uint32 totalFileSizeBytes;            // Total file size, kept as a sanity check
   1.172 +
   1.173 +  // UTF8 primitives (values of the corresponding UTF8PropObj fields)
   1.174 +  CLD2::uint32 utf8PropObj_state0;
   1.175 +  CLD2::uint32 utf8PropObj_state0_size;
   1.176 +  CLD2::uint32 utf8PropObj_total_size;
   1.177 +  CLD2::uint32 utf8PropObj_max_expand;        // int in UTF8PropObj, coerced to 32 bits
   1.178 +  CLD2::uint32 utf8PropObj_entry_shift;       // int in UTF8PropObj, coerced to 32 bits
   1.179 +  CLD2::uint32 utf8PropObj_bytes_per_entry;   // int in UTF8PropObj, coerced to 32 bits
   1.180 +  CLD2::uint32 utf8PropObj_losub;
   1.181 +  CLD2::uint32 utf8PropObj_hiadd;
   1.182 +  CLD2::uint32 startOf_utf8PropObj_state_table;
   1.183 +  CLD2::uint32 lengthOf_utf8PropObj_state_table;
   1.184 +  CLD2::uint32 startOf_utf8PropObj_remap_base;   // Block of 4-byte RemapEntry structs
   1.185 +  CLD2::uint32 lengthOf_utf8PropObj_remap_base;
   1.186 +  CLD2::uint32 startOf_utf8PropObj_remap_string;
   1.187 +  CLD2::uint32 lengthOf_utf8PropObj_remap_string;
   1.188 +  CLD2::uint32 startOf_utf8PropObj_fast_state;   // May be 0: fast_state can be null
   1.189 +  CLD2::uint32 lengthOf_utf8PropObj_fast_state;
   1.190 +
   1.191 +  // Average delta-octa-score bits (the const short kAvgDeltaOctaScore[] array)
   1.192 +  CLD2::uint32 startOf_kAvgDeltaOctaScore;
   1.193 +  CLD2::uint32 lengthOf_kAvgDeltaOctaScore;
   1.194 +
   1.195 +  // Table bits
   1.196 +  CLD2::uint32 numTablesEncoded;  // Number of CLD2TableSummary objects encoded (n)
   1.197 +  TableHeader* tableHeaders;      // Array of per-table headers. NOTE(review): presumably numTablesEncoded entries; this is a native pointer, so use calculateHeaderSize() rather than sizeof(FileHeader) for the on-disk size, and confirm who owns the array
   1.198 +};
   1.199 +
   1.200 +// Calculate the exact size of a header that encodes the specified number of
   1.201 +// tables (numTables = count of CLD2TableSummary objects). This can be used to
   1.202 +// reserve space within the data file, calculate offsets, and so on.
   1.203 +CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables); // NOTE(review): result is presumably in bytes - confirm
   1.204 +
   1.205 +// Dump a given header to stdout as a human-readable string.
   1.206 +void dumpHeader(FileHeader* header); // NOTE(review): pointer is non-const although dumping sounds read-only; null handling is not documented here - confirm
   1.207 +
   1.208 +// Verify that a given pair of scoring tables match precisely.
   1.209 +// Returns a bool, not a message string. NOTE(review): presumably true when the tables match - confirm in the implementation.
   1.210 +bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData);
   1.211 +
   1.212 +// Return true iff the program is running in little-endian mode, the only byte order the data format is suitable for.
   1.213 +bool isLittleEndian();
   1.214 +
   1.215 +// Return true iff the core size assumptions are ok on this platform.
   1.216 +bool coreAssumptionsOk(); // NOTE(review): the specific assumptions checked are not listed in this header - see the implementation
   1.217 +
   1.218 +} // End namespace CLD2DynamicData
   1.219 +#endif  // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_

mercurial