browser/components/translation/cld2/internal/getonescriptspan.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/getonescriptspan.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,110 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +//
    1.21 +
    1.22 +
    1.23 +#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
    1.24 +#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
    1.25 +
    1.26 +#include "integral_types.h"
    1.27 +#include "langspan.h"
    1.28 +#include "offsetmap.h"
    1.29 +
    1.30 +namespace CLD2 {
    1.31 +
    1.32 +static const int kMaxScriptBuffer = 40960;
    1.33 +static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
    1.34 +static const int kMaxScriptBytes = kMaxScriptBuffer - 32;   // Leave some room
    1.35 +static const int kWithinScriptTail = 32;    // Stop at word space in last
    1.36 +                                            // N bytes of script buffer
    1.37 +
    1.38 +
    1.39 +static inline bool IsContinuationByte(char c) {
    1.40 +  return static_cast<signed char>(c) < -64;
    1.41 +}
    1.42 +
    1.43 +// Gets lscript number for letters; always returns
    1.44 +//   0 (common script) for non-letters
    1.45 +int GetUTF8LetterScriptNum(const char* src);
    1.46 +
    1.47 +// Update src pointer to point to next quadgram, +2..+5
    1.48 +// Looks at src[0..4]
    1.49 +const char* AdvanceQuad(const char* src);
    1.50 +
    1.51 +
    1.52 +class ScriptScanner {
    1.53 + public:
    1.54 +  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
    1.55 +  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
    1.56 +                bool any_text, bool any_script);
    1.57 +  ~ScriptScanner();
    1.58 +
    1.59 +  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
    1.60 +  bool GetOneScriptSpan(LangSpan* span);
    1.61 +
    1.62 +  // Force Latin and Cyrillic scripts to be lowercase
    1.63 +  void LowerScriptSpan(LangSpan* span);
    1.64 +
    1.65 +  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
    1.66 +  // Force Latin and Cyrillic scripts to be lowercase
    1.67 +  bool GetOneScriptSpanLower(LangSpan* span);
    1.68 +
    1.69 +  // Copy next run of non-tag characters to buffer [NUL terminated]
    1.70 +  // This just removes tags and removes entities
    1.71 +  // Buffer has leading space
    1.72 +  bool GetOneTextSpan(LangSpan* span);
    1.73 +
    1.74 +  // Maps byte offset in most recent GetOneScriptSpan/Lower
    1.75 +  // span->text [0..text_bytes] into an additional byte offset from
    1.76 +  // span->offset, to get back to corresponding text in the original
    1.77 +  // input buffer.
    1.78 +  // text_offset must be the first byte
    1.79 +  // of a UTF-8 character, or just beyond the last character. Normally this
    1.80 +  // routine is called with the first byte of an interesting range and
    1.81 +  // again with the first byte of the following range.
    1.82 +  int MapBack(int text_offset);
    1.83 +
    1.84 +  const char* GetBufferStart() {return start_byte_;};
    1.85 +
    1.86 + private:
    1.87 +  // Skip over tags and non-letters
    1.88 +  int SkipToFrontOfSpan(const char* src, int len, int* script);
    1.89 +
    1.90 +  const char* start_byte_;        // Starting byte of buffer to scan
    1.91 +  const char* next_byte_;         // First unscanned byte
    1.92 +  const char* next_byte_limit_;   // Last byte + 1
    1.93 +  int byte_length_;               // Bytes left: next_byte_limit_ - next_byte_
    1.94 +
    1.95 +  bool is_plain_text_;            // true fo text, false for HTML
    1.96 +  char* script_buffer_;           // Holds text with expanded entities
    1.97 +  char* script_buffer_lower_;     // Holds lowercased text
    1.98 +  bool letters_marks_only_;       // To distinguish scriptspan of one
    1.99 +                                  // letters/marks vs. any mixture of text
   1.100 +  bool one_script_only_;          // To distinguish scriptspan of one
   1.101 +                                  // script vs. any mixture of scripts
   1.102 +  int exit_state_;                // For tag parser kTagParseTbl_0, based
   1.103 +                                  // on letters_marks_only_
   1.104 + public :
   1.105 +  // Expose for debugging
   1.106 +  OffsetMap map2original_;    // map from script_buffer_ to buffer
   1.107 +  OffsetMap map2uplow_;       // map from script_buffer_lower_ to script_buffer_
   1.108 +};
   1.109 +
   1.110 +}  // namespace CLD2
   1.111 +
   1.112 +#endif  // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
   1.113 +

mercurial