browser/components/translation/cld2/internal/offsetmap.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/offsetmap.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,175 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +//
    1.21 +
    1.22 +#ifndef UTIL_UTF8_OFFSETMAP_H_
    1.23 +#define UTIL_UTF8_OFFSETMAP_H_
    1.24 +
    1.25 +#include <string>                       // for string
    1.26 +#include "integral_types.h"             // for uint32
    1.27 +
    1.28 +// ***************************** OffsetMap **************************
    1.29 +//
    1.30 +// An OffsetMap object is a container for a mapping from offsets in one text
    1.31 +// buffer A' to offsets in another text buffer A. It is most useful when A' is
    1.32 +// built from A via substitutions that occasionally do not preserve byte length.
    1.33 +//
    1.34 +// A series of operators are used to build the correspondence map, then
    1.35 +// calls can be made to map an offset in A' to an offset in A, or vice versa.
    1.36 +// The map starts with offset 0 in A corresponding to offset 0 in A'.
    1.37 +// The mapping is then built sequentially, adding on byte ranges that are
    1.38 +// identical in A and A', byte ranges that are inserted in A', and byte ranges
    1.39 +// that are deleted from A. All bytes beyond those specified when building the
    1.40 +// map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
    1.41 +// end of the map.
    1.42 +//
    1.43 +// The internal data structure records positions at which bytes are added or
    1.44 +// deleted. Using the map is O(1) when increasing the A' or A offset
    1.45 +// monotonically, and O(n) when accessing random offsets, where n is the
    1.46 +// number of differences.
    1.47 +//
    1.48 +
    1.49 +namespace CLD2 {
    1.50 +
    1.51 +class OffsetMap {
    1.52 + public:
    1.53 +  // Constructor, destructor
    1.54 +  OffsetMap();
    1.55 +  ~OffsetMap();
    1.56 +
    1.57 +  // Clear the map
    1.58 +  void Clear();
    1.59 +
    1.60 +  // Add to  mapping from A to A', specifying how many next bytes correspond
    1.61 +  // in A and A'
    1.62 +  void Copy(int bytes);
    1.63 +
    1.64 +  // Add to mapping from A to A', specifying how many next bytes are
    1.65 +  // inserted in A' while not advancing in A at all
    1.66 +  void Insert(int bytes);
    1.67 +
    1.68 +  // Add to mapping from A to A', specifying how many next bytes are
    1.69 +  // deleted from A while not advancing in A' at all
    1.70 +  void Delete(int bytes);
    1.71 +
    1.72 +  // Print map to file, for debugging
    1.73 +  void Printmap(const char* filename);
    1.74 +
    1.75 +  // [Finish building map,] Re-position to offset 0
    1.76 +  // This call is optional; MapForward and MapBack finish building the map
    1.77 +  // if necessary
    1.78 +  void Reset();
    1.79 +
    1.80 +  // Map an offset in A' to the corresponding offset in A
    1.81 +  int MapBack(int aprimeoffset);
    1.82 +
    1.83 +  // Map an offset in A to the corresponding offset in A'
    1.84 +  int MapForward(int aoffset);
    1.85 +
    1.86 +  // h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
    1.87 +  // from A' to A'' and h is from A to A''.
    1.88 +  //
    1.89 +  // Note that g->MoveForward(f->MoveForward(aoffset)) always equals
    1.90 +  // to h->MoveForward(aoffset), while
    1.91 +  // f->MoveBack(g->MoveBack(aprimeprimeoffset)) doesn't always equals
    1.92 +  // to h->MoveBack(aprimeprimeoffset). This happens when deletion in
    1.93 +  // f and insertion in g are at the same place.  For example,
    1.94 +  //
    1.95 +  // A    1   2   3   4
    1.96 +  //      ^   |  ^    ^
    1.97 +  //      |   | /     |  f
    1.98 +  //      v   vv      v
    1.99 +  // A'   1'  2'      3'
   1.100 +  //      ^   ^^      ^
   1.101 +  //      |   | \     |  g
   1.102 +  //      v   |  v    v
   1.103 +  // A''  1'' 2'' 3'' 4''
   1.104 +  //
   1.105 +  // results in:
   1.106 +  //
   1.107 +  // A    1   2   3   4
   1.108 +  //      ^   ^\  ^   ^
   1.109 +  //      |   | \ |   |  h
   1.110 +  //      v   |  vv   v
   1.111 +  // A''  1'' 2'' 3'' 4''
   1.112 +  //
   1.113 +  // 2'' is mapped 3 in the former figure, while 2'' is mapped to 2 in
   1.114 +  // the latter figure.
   1.115 +  static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
   1.116 +
   1.117 +  // For debugging only; writes to stderr
   1.118 +  void DumpWindow();
   1.119 +
   1.120 +  // For testing only -- force a mapping
   1.121 +  void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
   1.122 +
   1.123 + private:
   1.124 +  enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
   1.125 +
   1.126 +  void Flush();
   1.127 +  void FlushAll();
   1.128 +  void MaybeFlushAll();
   1.129 +  void Emit(MapOp op, int len);
   1.130 +
   1.131 +  void SetLeft();
   1.132 +  void SetRight();
   1.133 +
   1.134 +  // Back up over previous range, 1..5 bytes
   1.135 +  // Return subscript at the beginning of that. Pins at 0
   1.136 +  int Backup(int sub);
   1.137 +
   1.138 +  // Parse next range, 1..5 bytes
   1.139 +  // Return subscript just off the end of that
   1.140 +  int ParseNext(int sub, MapOp* op, int* length);
   1.141 +
   1.142 +  // Parse previous range, 1..5 bytes
   1.143 +  // Return current subscript
   1.144 +  int ParsePrevious(int sub, MapOp* op, int* length);
   1.145 +
   1.146 +  void PrintPosition(const char* str);
   1.147 +
   1.148 +  bool MoveRight();     // Returns true if OK
   1.149 +  bool MoveLeft();      // Returns true if OK
   1.150 +  void DumpString();
   1.151 +
   1.152 +  // Copies insert operations from source to dest. Returns true if no
   1.153 +  // other operations are found.
   1.154 +  static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
   1.155 +
   1.156 +  // Copies delete operations from source to dest. Returns true if no other
   1.157 +  // operations are found.
   1.158 +  static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
   1.159 +
   1.160 +  std::string diffs_;
   1.161 +  MapOp pending_op_;
   1.162 +  uint32 pending_length_;
   1.163 +
   1.164 +  // Offsets in the ranges below correspond to each other, with A' = A + diff
   1.165 +  int next_diff_sub_;
   1.166 +  int current_lo_aoffset_;
   1.167 +  int current_hi_aoffset_;
   1.168 +  int current_lo_aprimeoffset_;
   1.169 +  int current_hi_aprimeoffset_;
   1.170 +  int current_diff_;
   1.171 +  int max_aoffset_;
   1.172 +  int max_aprimeoffset_;
   1.173 +};
   1.174 +
   1.175 +}  // namespace CLD2
   1.176 +
   1.177 +#endif  // UTIL_UTF8_OFFSETMAP_H_
   1.178 +

mercurial