browser/components/translation/cld2/internal/getonescriptspan.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.
     2 //
     3 // Licensed under the Apache License, Version 2.0 (the "License");
     4 // you may not use this file except in compliance with the License.
     5 // You may obtain a copy of the License at
     6 //
     7 //     http://www.apache.org/licenses/LICENSE-2.0
     8 //
     9 // Unless required by applicable law or agreed to in writing, software
    10 // distributed under the License is distributed on an "AS IS" BASIS,
    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12 // See the License for the specific language governing permissions and
    13 // limitations under the License.
    15 //
    16 // Author: dsites@google.com (Dick Sites)
    17 //
    20 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
    21 #define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
    23 #include "integral_types.h"
    24 #include "langspan.h"
    25 #include "offsetmap.h"
    27 namespace CLD2 {
    // Byte-size limits used while filling script spans.
    // NOTE(review): presumably these size ScriptScanner's script_buffer_ and
    // script_buffer_lower_ -- confirm against the implementation file.
    static const int kMaxScriptBuffer = 40960;
    // 1.5x kMaxScriptBuffer.
    static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
    static const int kMaxScriptBytes = kMaxScriptBuffer - 32;   // Leave some room
    static const int kWithinScriptTail = 32;    // Stop at word space in last
                                                // N bytes of script buffer
    // Returns true iff c is a UTF-8 continuation byte, i.e. its bit pattern is
    // 10xxxxxx (0x80..0xBF). ASCII bytes and UTF-8 lead bytes return false.
    static inline bool IsContinuationByte(char c) {
      // Go through unsigned char so the test does not depend on whether plain
      // char is signed on this platform.
      return (static_cast<unsigned char>(c) & 0xC0) == 0x80;
    }
    // Returns the lscript number of the character at src when it is a letter;
    // always returns 0 (common script) for non-letters.
    // NOTE(review): src presumably must point at the first byte of a UTF-8
    // character -- confirm against the definition.
    int GetUTF8LetterScriptNum(const char* src);
    // Returns a pointer to the next quadgram, which starts +2..+5 bytes past
    // src. src is passed by value, so the caller must use the returned pointer.
    // Looks at src[0..4].
    const char* AdvanceQuad(const char* src);
    49 class ScriptScanner {
    50  public:
    51   ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
    52   ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
    53                 bool any_text, bool any_script);
    54   ~ScriptScanner();
    56   // Copy next run of same-script non-tag letters to buffer [NUL terminated]
    57   bool GetOneScriptSpan(LangSpan* span);
    59   // Force Latin and Cyrillic scripts to be lowercase
    60   void LowerScriptSpan(LangSpan* span);
    62   // Copy next run of same-script non-tag letters to buffer [NUL terminated]
    63   // Force Latin and Cyrillic scripts to be lowercase
    64   bool GetOneScriptSpanLower(LangSpan* span);
    66   // Copy next run of non-tag characters to buffer [NUL terminated]
    67   // This just removes tags and removes entities
    68   // Buffer has leading space
    69   bool GetOneTextSpan(LangSpan* span);
    71   // Maps byte offset in most recent GetOneScriptSpan/Lower
    72   // span->text [0..text_bytes] into an additional byte offset from
    73   // span->offset, to get back to corresponding text in the original
    74   // input buffer.
    75   // text_offset must be the first byte
    76   // of a UTF-8 character, or just beyond the last character. Normally this
    77   // routine is called with the first byte of an interesting range and
    78   // again with the first byte of the following range.
    79   int MapBack(int text_offset);
    81   const char* GetBufferStart() {return start_byte_;};
    83  private:
    84   // Skip over tags and non-letters
    85   int SkipToFrontOfSpan(const char* src, int len, int* script);
    87   const char* start_byte_;        // Starting byte of buffer to scan
    88   const char* next_byte_;         // First unscanned byte
    89   const char* next_byte_limit_;   // Last byte + 1
    90   int byte_length_;               // Bytes left: next_byte_limit_ - next_byte_
    92   bool is_plain_text_;            // true fo text, false for HTML
    93   char* script_buffer_;           // Holds text with expanded entities
    94   char* script_buffer_lower_;     // Holds lowercased text
    95   bool letters_marks_only_;       // To distinguish scriptspan of one
    96                                   // letters/marks vs. any mixture of text
    97   bool one_script_only_;          // To distinguish scriptspan of one
    98                                   // script vs. any mixture of scripts
    99   int exit_state_;                // For tag parser kTagParseTbl_0, based
   100                                   // on letters_marks_only_
   101  public :
   102   // Expose for debugging
   103   OffsetMap map2original_;    // map from script_buffer_ to buffer
   104   OffsetMap map2uplow_;       // map from script_buffer_lower_ to script_buffer_
   105 };
   107 }  // namespace CLD2
   109 #endif  // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_

mercurial