1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/getonescriptspan.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,110 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 + 1.22 + 1.23 +#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ 1.24 +#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ 1.25 + 1.26 +#include "integral_types.h" 1.27 +#include "langspan.h" 1.28 +#include "offsetmap.h" 1.29 + 1.30 +namespace CLD2 { 1.31 + 1.32 +static const int kMaxScriptBuffer = 40960; 1.33 +static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2; 1.34 +static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room 1.35 +static const int kWithinScriptTail = 32; // Stop at word space in last 1.36 + // N bytes of script buffer 1.37 + 1.38 + 1.39 +static inline bool IsContinuationByte(char c) { 1.40 + return static_cast<signed char>(c) < -64; 1.41 +} 1.42 + 1.43 +// Gets lscript number for letters; always returns 1.44 +// 0 (common script) for non-letters 1.45 +int GetUTF8LetterScriptNum(const char* src); 1.46 + 1.47 +// Update src pointer to point to next quadgram, +2..+5 1.48 +// Looks at src[0..4] 1.49 +const char* AdvanceQuad(const char* src); 1.50 + 1.51 + 1.52 +class ScriptScanner { 1.53 + public: 1.54 + ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text); 1.55 + ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text, 1.56 + bool any_text, bool any_script); 1.57 + ~ScriptScanner(); 1.58 + 1.59 + // Copy next run of same-script non-tag letters to buffer [NUL terminated] 1.60 + bool GetOneScriptSpan(LangSpan* span); 1.61 + 1.62 + // Force Latin and Cyrillic scripts to be lowercase 1.63 + void LowerScriptSpan(LangSpan* span); 1.64 + 1.65 + // Copy next run of same-script non-tag letters to buffer [NUL terminated] 1.66 + // Force Latin and Cyrillic scripts to be lowercase 1.67 + bool GetOneScriptSpanLower(LangSpan* span); 1.68 + 1.69 + // Copy next run of non-tag characters to buffer [NUL terminated] 1.70 + // This just removes tags and removes entities 1.71 + // Buffer has leading space 1.72 + bool GetOneTextSpan(LangSpan* span); 1.73 + 1.74 + // Maps byte offset in most recent GetOneScriptSpan/Lower 1.75 + // span->text [0..text_bytes] into an additional byte offset from 1.76 + // span->offset, to get back to corresponding text in the original 1.77 + // input buffer. 1.78 + // text_offset must be the first byte 1.79 + // of a UTF-8 character, or just beyond the last character. Normally this 1.80 + // routine is called with the first byte of an interesting range and 1.81 + // again with the first byte of the following range. 1.82 + int MapBack(int text_offset); 1.83 + 1.84 + const char* GetBufferStart() {return start_byte_;}; 1.85 + 1.86 + private: 1.87 + // Skip over tags and non-letters 1.88 + int SkipToFrontOfSpan(const char* src, int len, int* script); 1.89 + 1.90 + const char* start_byte_; // Starting byte of buffer to scan 1.91 + const char* next_byte_; // First unscanned byte 1.92 + const char* next_byte_limit_; // Last byte + 1 1.93 + int byte_length_; // Bytes left: next_byte_limit_ - next_byte_ 1.94 + 1.95 + bool is_plain_text_; // true fo text, false for HTML 1.96 + char* script_buffer_; // Holds text with expanded entities 1.97 + char* script_buffer_lower_; // Holds lowercased text 1.98 + bool letters_marks_only_; // To distinguish scriptspan of one 1.99 + // letters/marks vs. any mixture of text 1.100 + bool one_script_only_; // To distinguish scriptspan of one 1.101 + // script vs. any mixture of scripts 1.102 + int exit_state_; // For tag parser kTagParseTbl_0, based 1.103 + // on letters_marks_only_ 1.104 + public : 1.105 + // Expose for debugging 1.106 + OffsetMap map2original_; // map from script_buffer_ to buffer 1.107 + OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_ 1.108 +}; 1.109 + 1.110 +} // namespace CLD2 1.111 + 1.112 +#endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ 1.113 +