browser/components/translation/cld2/internal/getonescriptspan.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // Author: dsites@google.com (Dick Sites)
michael@0 17 //
michael@0 18
michael@0 19
michael@0 20 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
michael@0 21 #define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
michael@0 22
michael@0 23 #include "integral_types.h"
michael@0 24 #include "langspan.h"
michael@0 25 #include "offsetmap.h"
michael@0 26
michael@0 27 namespace CLD2 {
michael@0 28
// Size of the script buffer.
static const int kMaxScriptBuffer = 40960;

// Size of the lowercased script buffer: one and a half times the script
// buffer.
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;

// Byte limit for a script span; kept 32 below the buffer size to leave
// some room.
static const int kMaxScriptBytes = kMaxScriptBuffer - 32;

// Within the last N bytes of the script buffer, stop at a word space.
static const int kWithinScriptTail = 32;
michael@0 34
michael@0 35
// Returns true when c lies in the byte range 0x80..0xBF, i.e. its top two
// bits are "10" (the UTF-8 continuation-byte pattern).
static inline bool IsContinuationByte(char c) {
  const unsigned char byte = static_cast<unsigned char>(c);
  return (byte & 0xC0) == 0x80;
}
michael@0 39
// Returns the lscript number for the letter at src.
// Non-letters always yield 0 (common script).
int GetUTF8LetterScriptNum(const char* src);

// Returns src moved forward to the next quadgram, an advance of +2..+5.
// Examines src[0..4].
const char* AdvanceQuad(const char* src);
michael@0 47
michael@0 48
michael@0 49 class ScriptScanner {
michael@0 50 public:
michael@0 51 ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
michael@0 52 ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
michael@0 53 bool any_text, bool any_script);
michael@0 54 ~ScriptScanner();
michael@0 55
michael@0 56 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
michael@0 57 bool GetOneScriptSpan(LangSpan* span);
michael@0 58
michael@0 59 // Force Latin and Cyrillic scripts to be lowercase
michael@0 60 void LowerScriptSpan(LangSpan* span);
michael@0 61
michael@0 62 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
michael@0 63 // Force Latin and Cyrillic scripts to be lowercase
michael@0 64 bool GetOneScriptSpanLower(LangSpan* span);
michael@0 65
michael@0 66 // Copy next run of non-tag characters to buffer [NUL terminated]
michael@0 67 // This just removes tags and removes entities
michael@0 68 // Buffer has leading space
michael@0 69 bool GetOneTextSpan(LangSpan* span);
michael@0 70
michael@0 71 // Maps byte offset in most recent GetOneScriptSpan/Lower
michael@0 72 // span->text [0..text_bytes] into an additional byte offset from
michael@0 73 // span->offset, to get back to corresponding text in the original
michael@0 74 // input buffer.
michael@0 75 // text_offset must be the first byte
michael@0 76 // of a UTF-8 character, or just beyond the last character. Normally this
michael@0 77 // routine is called with the first byte of an interesting range and
michael@0 78 // again with the first byte of the following range.
michael@0 79 int MapBack(int text_offset);
michael@0 80
michael@0 81 const char* GetBufferStart() {return start_byte_;};
michael@0 82
michael@0 83 private:
michael@0 84 // Skip over tags and non-letters
michael@0 85 int SkipToFrontOfSpan(const char* src, int len, int* script);
michael@0 86
michael@0 87 const char* start_byte_; // Starting byte of buffer to scan
michael@0 88 const char* next_byte_; // First unscanned byte
michael@0 89 const char* next_byte_limit_; // Last byte + 1
michael@0 90 int byte_length_; // Bytes left: next_byte_limit_ - next_byte_
michael@0 91
michael@0 92 bool is_plain_text_; // true fo text, false for HTML
michael@0 93 char* script_buffer_; // Holds text with expanded entities
michael@0 94 char* script_buffer_lower_; // Holds lowercased text
michael@0 95 bool letters_marks_only_; // To distinguish scriptspan of one
michael@0 96 // letters/marks vs. any mixture of text
michael@0 97 bool one_script_only_; // To distinguish scriptspan of one
michael@0 98 // script vs. any mixture of scripts
michael@0 99 int exit_state_; // For tag parser kTagParseTbl_0, based
michael@0 100 // on letters_marks_only_
michael@0 101 public :
michael@0 102 // Expose for debugging
michael@0 103 OffsetMap map2original_; // map from script_buffer_ to buffer
michael@0 104 OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
michael@0 105 };
michael@0 106
michael@0 107 } // namespace CLD2
michael@0 108
michael@0 109 #endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
michael@0 110

mercurial