browser/components/translation/cld2/internal/cld2_dynamic_data.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 // Copyright 2014 Google Inc. All Rights Reserved.                                                  
     2 //                                                                                                  
     3 // Licensed under the Apache License, Version 2.0 (the "License");                                  
     4 // you may not use this file except in compliance with the License.                                 
     5 // You may obtain a copy of the License at                                                          
     6 //                                                                                                  
     7 //     http://www.apache.org/licenses/LICENSE-2.0                                                   
     8 //                                                                                                  
     9 // Unless required by applicable law or agreed to in writing, software                              
    10 // distributed under the License is distributed on an "AS IS" BASIS,                                
    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.                         
    12 // See the License for the specific language governing permissions and                              
    13 // limitations under the License.
    15 #ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
    16 #define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
    18 #include "integral_types.h"
    19 #include "cld2tablesummary.h"
    20 #include "utf8statetable.h"
    21 #include "scoreonescriptspan.h"
    23 /*
    24   There are two primary parts to a CLD2 dynamic data file:
    25     1. A header, wherein trivial data, block lengths and block offsets are kept
    26     2. A data block, wherein the large binary blocks are kept
    28   By reading the header, an application can determine the offsets and lengths of
    29   all the data blocks for all tables. Offsets in the header are expressed
    30   relative to the first byte of the file, inclusive of the header itself; thus,
    31   any offset whose value is less than the length of the header is invalid.
    33   Any offset whose value is zero indicates a field that is null in the
    34   underlying CLD2 data; a real example of this is the fast_state field of the
    35   UTF8PropObj, which may be null.
    37   The size of the header can be precalculated by calling calculateHeaderSize(),
    38   which will indicate the exact size of the header for a data file that contains
    39   a given number of CLD2TableSummary objects.
    41   Notes on endianness:
    42   The data format is only suitable for little-endian machines. For big-endian
    43   systems, a tedious transformation would need to be made first to reverse the
    44   byte order of significant portions of the binary - not just the lengths, but
    45   also some of the underlying table data.
    47   Note on 32/64 bit:
    48   The data format is agnostic to 32/64 bit pointers. All the offsets within the 
    49   data blob itself are 32-bit values relative to the start of the file, and the
    50   file should certainly never be gigabytes in size!
    51   When the file is ultimately read by the loading code and mmap()'d, new
    52   pointers are generated at whatever size the system uses, initialized to the
    53   start of the mmap, and incremented by the 32-bit offset. This should be safe
    54   regardless of 32- or 64-bit architectures.
    56   --------------------------------------------------------------------
    57   FIELD
    58   --------------------------------------------------------------------
    59   DATA_FILE_MARKER (no null terminator)
    60   total file size (sanity check, uint32)
    61   --------------------------------------------------------------------
    62   UTF8PropObj: const uint32 state0
    63   UTF8PropObj: const uint32 state0_size
    64   UTF8PropObj: const uint32 total_size
    65   UTF8PropObj: const int max_expand (coerced to 32 bits)
    66   UTF8PropObj: const int entry_shift (coerced to 32 bits)
    67   UTF8PropObj: const int bytes_per_entry (coerced to 32 bits)
    68   UTF8PropObj: const uint32 losub
    69   UTF8PropObj: const uint32 hiadd
    70   offset of UTF8PropObj: const uint8* state_table
    71   length of UTF8PropObj: const uint8* state_table
    72   offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
    73   length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
    74   offset of UTF8PropObj: const uint8* remap_string
    75   length of UTF8PropObj: const uint8* remap_string
    76   offset of UTF8PropObj: const uint8* fast_state
    77   length of UTF8PropObj: const uint8* fast_state
    78   --------------------------------------------------------------------
    79   offset of const short kAvgDeltaOctaScore[]
    80   length of const short kAvgDeltaOctaScore[]
    81   --------------------------------------------------------------------
    82   number of CLD2TableSummary objects encoded (n)
    83   [Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne
    84   [Table 1]: CLD2TableSummary: uint32 kCLDTableSize
    85   [Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask
    86   [Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate
    87   [Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
    88   [Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
    89   [Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd
    90   [Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd
    91   [Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
    92   [Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts, plus 1 for the null terminator
    93   .
    94   .
    95   .
    96   [Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne
    97   [Table n]: CLD2TableSummary: uint32 kCLDTableSize
    98   [Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask
    99   [Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate
   100   [Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   101   [Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   102   [Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd
   103   [Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd
   104   [Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
   105   [Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts, plus 1 for the null terminator
   106   --------------------------------------------------------------------
   109   Immediately after the header fields comes the data block. The data block has
   110   the following content, in this order (note that padding is applied in order to
   111   keep lookups word-aligned):
   113   UTF8PropObj: const uint8* state_table
   114   UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
   115   UTF8PropObj: const uint8* remap_string
   116   UTF8PropObj: const uint8* fast_state
   117   const short kAvgDeltaOctaScore[]
   118   [Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   119   [Table 1]: CLD2TableSummary: const uint32* kCLDTableInd
   120   [Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
   121   .
   122   .
   123   .
   124   [Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   125   [Table n]: CLD2TableSummary: const uint32* kCLDTableInd
   126   [Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
   129   It is STRONGLY recommended that the chunks within the data block be kept
   130   128-bit aligned for efficiency reasons, although the code will work without
   131   such alignment: the main lookup tables have randomly-accessed groups of four
   132   4-byte entries, and these must be 16-byte aligned to avoid the performance
   133   cost of multiple cache misses per group.
   134 */
   135 namespace CLD2DynamicData {
   137 static const char* DATA_FILE_MARKER = "cld2_data_file00";
   138 static const int DATA_FILE_MARKER_LENGTH = 16; // Keep aligned to 128 bits
   140 // Nicer version of memcmp that shows the offset at which bytes differ.
       //   data1, data2: the two buffers to compare.
       //   length:       how much of each buffer to compare (presumably a byte
       //                 count, as with memcmp -- confirm).
       // NOTE(review): the meaning of the bool result (presumably true when the
       // buffers are identical) is defined by the implementation -- confirm.
   141 bool mem_compare(const void* data1, const void* data2, const int length);
   143 // Enable or disable debugging; 0 to disable, 1 to enable.
       // NOTE(review): the effect of values other than 0 and 1 is not specified
       // in this header -- confirm against the implementation.
   144 void setDebug(int debug);
   146 // Lower-level structure for individual tables. There are n table headers in
   147 // a given file header, one per encoded CLD2TableSummary.
       // The first four members carry the summary's scalar values; the remaining
       // members locate its three data chunks as (offset, length) pairs. As
       // described in the format notes at the top of this file, offsets are
       // relative to the first byte of the file, and an offset of zero denotes a
       // field that is null in the underlying CLD2 data.
       // NOTE(review): lengths are presumably byte counts -- confirm against the
       // code that writes the file.
   148 typedef struct {
   149   CLD2::uint32 kCLDTableSizeOne;  // CLD2TableSummary: kCLDTableSizeOne
   150   CLD2::uint32 kCLDTableSize;  // CLD2TableSummary: kCLDTableSize
   151   CLD2::uint32 kCLDTableKeyMask;  // CLD2TableSummary: kCLDTableKeyMask
   152   CLD2::uint32 kCLDTableBuildDate;  // CLD2TableSummary: kCLDTableBuildDate
   153   CLD2::uint32 startOf_kCLDTable;  // offset of the IndirectProbBucket4 array
   154   CLD2::uint32 lengthOf_kCLDTable;  // length of the IndirectProbBucket4 array
   155   CLD2::uint32 startOf_kCLDTableInd;  // offset of the uint32 kCLDTableInd array
   156   CLD2::uint32 lengthOf_kCLDTableInd;  // length of the uint32 kCLDTableInd array
   157   CLD2::uint32 startOf_kRecognizedLangScripts;  // offset of the string
   158   CLD2::uint32 lengthOf_kRecognizedLangScripts;  // string length + 1 (null terminator)
   159 } TableHeader;
   162 // Top-level structure for a CLD2 Data File Header.
   163 // Contains all the primitive fields for the header as well as an array of
   164 // headers for the individual tables.
       // Members appear in the same order as the header fields listed in the
       // format description at the top of this file. All startOf_* members are
       // offsets relative to the first byte of the file; zero means the field is
       // null in the underlying CLD2 data (e.g. fast_state).
   165 typedef struct {
   166   // Marker fields help recognize and verify the data file
   167   char sanityString[DATA_FILE_MARKER_LENGTH];  // DATA_FILE_MARKER, no null terminator
   168   CLD2::uint32 totalFileSizeBytes;  // total file size, kept as a sanity check
   170   // UTF8 primitives
   171   CLD2::uint32 utf8PropObj_state0;
   172   CLD2::uint32 utf8PropObj_state0_size;
   173   CLD2::uint32 utf8PropObj_total_size;
   174   CLD2::uint32 utf8PropObj_max_expand;  // int in UTF8PropObj, held here as 32 bits
   175   CLD2::uint32 utf8PropObj_entry_shift;  // int in UTF8PropObj, coerced to 32 bits
   176   CLD2::uint32 utf8PropObj_bytes_per_entry;  // int in UTF8PropObj, coerced to 32 bits
   177   CLD2::uint32 utf8PropObj_losub;
   178   CLD2::uint32 utf8PropObj_hiadd;
   179   CLD2::uint32 startOf_utf8PropObj_state_table;  // uint8 array
   180   CLD2::uint32 lengthOf_utf8PropObj_state_table;
   181   CLD2::uint32 startOf_utf8PropObj_remap_base;  // array of 4-byte RemapEntry structs
   182   CLD2::uint32 lengthOf_utf8PropObj_remap_base;
   183   CLD2::uint32 startOf_utf8PropObj_remap_string;  // uint8 array
   184   CLD2::uint32 lengthOf_utf8PropObj_remap_string;
   185   CLD2::uint32 startOf_utf8PropObj_fast_state;  // uint8 array; may be null (offset 0)
   186   CLD2::uint32 lengthOf_utf8PropObj_fast_state;
   188   // Average delta-octa-score bits
   189   CLD2::uint32 startOf_kAvgDeltaOctaScore;  // array of short
   190   CLD2::uint32 lengthOf_kAvgDeltaOctaScore;
   192   // Table bits
   193   CLD2::uint32 numTablesEncoded;  // number of CLD2TableSummary objects encoded (n)
       // Headers for the individual tables.
       // NOTE(review): presumably points at numTablesEncoded entries; being a
       // native pointer, its width varies by platform, so this member is
       // presumably an in-memory convenience rather than part of the on-disk
       // layout. Ownership is not visible from this header -- confirm.
   194   TableHeader* tableHeaders;
   195 } FileHeader;
   197 // Calculate the exact size of a header that encodes the specified number of
   198 // tables. This can be used to reserve space within the data file,
   199 // calculate offsets, and so on.
       //   numTables: how many CLD2TableSummary objects the data file contains.
       // NOTE(review): the result is presumably a byte count -- confirm.
   200 CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables);
   202 // Dump a given header to stdout as a human-readable string.
       //   header: the FileHeader to print.
       // NOTE(review): whether a null header is tolerated is not visible from
       // this declaration -- confirm before passing one.
   203 void dumpHeader(FileHeader* header);
   205 // Verify that a given pair of scoring tables match precisely.
   206 // The outcome is reported as a bool; no error-message string is returned.
       //   realData:   the reference scoring tables.
       //   loadedData: the scoring tables to check against realData.
       // NOTE(review): presumably true means the tables match and false means a
       // mismatch was found -- confirm against the implementation.
   207 bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData);
   209 // Return true iff the program is running in little-endian mode.
       // The dynamic data format described at the top of this file is only
       // suitable for little-endian machines.
   210 bool isLittleEndian();
   212 // Return true iff the core size assumptions are ok on this platform.
       // NOTE(review): which sizes are checked is defined by the implementation;
       // presumably the widths of the types the file format relies on -- confirm.
   213 bool coreAssumptionsOk();
   215 } // End namespace CLD2DynamicData
   216 #endif  // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_

mercurial