security/sandbox/chromium/base/strings/utf_string_conversion_utils.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/sandbox/chromium/base/strings/utf_string_conversion_utils.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,148 @@
     1.4 +// Copyright (c) 2009 The Chromium Authors. All rights reserved.
     1.5 +// Use of this source code is governed by a BSD-style license that can be
     1.6 +// found in the LICENSE file.
     1.7 +
     1.8 +#include "base/strings/utf_string_conversion_utils.h"
     1.9 +
    1.10 +#include "base/third_party/icu/icu_utf.h"
    1.11 +
    1.12 +namespace base {
    1.13 +
    1.14 +// ReadUnicodeCharacter --------------------------------------------------------
    1.15 +
    1.16 +bool ReadUnicodeCharacter(const char* src,
    1.17 +                          int32 src_len,
    1.18 +                          int32* char_index,
    1.19 +                          uint32* code_point_out) {
    1.20 +  // U8_NEXT expects to be able to use -1 to signal an error, so we must
    1.21 +  // use a signed type for code_point.  But this function returns false
    1.22 +  // on error anyway, so code_point_out is unsigned.
    1.23 +  int32 code_point;
    1.24 +  CBU8_NEXT(src, *char_index, src_len, code_point);
    1.25 +  *code_point_out = static_cast<uint32>(code_point);
    1.26 +
    1.27 +  // The ICU macro above moves to the next char, we want to point to the last
    1.28 +  // char consumed.
    1.29 +  (*char_index)--;
    1.30 +
    1.31 +  // Validate the decoded value.
    1.32 +  return IsValidCodepoint(code_point);
    1.33 +}
    1.34 +
    1.35 +bool ReadUnicodeCharacter(const char16* src,
    1.36 +                          int32 src_len,
    1.37 +                          int32* char_index,
    1.38 +                          uint32* code_point) {
    1.39 +  if (CBU16_IS_SURROGATE(src[*char_index])) {
    1.40 +    if (!CBU16_IS_SURROGATE_LEAD(src[*char_index]) ||
    1.41 +        *char_index + 1 >= src_len ||
    1.42 +        !CBU16_IS_TRAIL(src[*char_index + 1])) {
    1.43 +      // Invalid surrogate pair.
    1.44 +      return false;
    1.45 +    }
    1.46 +
    1.47 +    // Valid surrogate pair.
    1.48 +    *code_point = CBU16_GET_SUPPLEMENTARY(src[*char_index],
    1.49 +                                          src[*char_index + 1]);
    1.50 +    (*char_index)++;
    1.51 +  } else {
    1.52 +    // Not a surrogate, just one 16-bit word.
    1.53 +    *code_point = src[*char_index];
    1.54 +  }
    1.55 +
    1.56 +  return IsValidCodepoint(*code_point);
    1.57 +}
    1.58 +
    #if defined(WCHAR_T_IS_UTF32)
    // Reads the element of |src| at |*char_index| into |*code_point|. Each
    // element already holds a full code point, so no decoding is needed and
    // |*char_index| is not modified. |src_len| is not consulted.
    //
    // Returns the result of IsValidCodepoint() for the value read.
    bool ReadUnicodeCharacter(const wchar_t* src,
                              int32 src_len,
                              int32* char_index,
                              uint32* code_point) {
      const uint32 value = src[*char_index];
      *code_point = value;
      return IsValidCodepoint(value);
    }
    #endif  // defined(WCHAR_T_IS_UTF32)
    1.71 +
    1.72 +// WriteUnicodeCharacter -------------------------------------------------------
    1.73 +
    1.74 +size_t WriteUnicodeCharacter(uint32 code_point, std::string* output) {
    1.75 +  if (code_point <= 0x7f) {
    1.76 +    // Fast path the common case of one byte.
    1.77 +    output->push_back(code_point);
    1.78 +    return 1;
    1.79 +  }
    1.80 +
    1.81 +
    1.82 +  // CBU8_APPEND_UNSAFE can append up to 4 bytes.
    1.83 +  size_t char_offset = output->length();
    1.84 +  size_t original_char_offset = char_offset;
    1.85 +  output->resize(char_offset + CBU8_MAX_LENGTH);
    1.86 +
    1.87 +  CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
    1.88 +
    1.89 +  // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so
    1.90 +  // it will represent the new length of the string.
    1.91 +  output->resize(char_offset);
    1.92 +  return char_offset - original_char_offset;
    1.93 +}
    1.94 +
    1.95 +size_t WriteUnicodeCharacter(uint32 code_point, string16* output) {
    1.96 +  if (CBU16_LENGTH(code_point) == 1) {
    1.97 +    // Thie code point is in the Basic Multilingual Plane (BMP).
    1.98 +    output->push_back(static_cast<char16>(code_point));
    1.99 +    return 1;
   1.100 +  }
   1.101 +  // Non-BMP characters use a double-character encoding.
   1.102 +  size_t char_offset = output->length();
   1.103 +  output->resize(char_offset + CBU16_MAX_LENGTH);
   1.104 +  CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
   1.105 +  return CBU16_MAX_LENGTH;
   1.106 +}
   1.107 +
   1.108 +// Generalized Unicode converter -----------------------------------------------
   1.109 +
   // Empties |output| and, for non-empty input, reserves capacity for the UTF-8
   // text about to be produced from the |src_len| elements at |src|.
   //
   // Only the first element is inspected: if it is below 0x80 the input is
   // guessed to be all ASCII (one byte per element), otherwise three bytes per
   // element are reserved.
   template<typename CHAR>
   void PrepareForUTF8Output(const CHAR* src,
                             size_t src_len,
                             std::string* output) {
     output->clear();
     if (src_len == 0)
       return;

     const bool starts_with_ascii = src[0] < 0x80;
     const size_t bytes_per_char = starts_with_ascii ? 1 : 3;
     output->reserve(src_len * bytes_per_char);
   }
   1.125 +
   1.126 +// Instantiate versions we know callers will need.
   1.127 +template void PrepareForUTF8Output(const wchar_t*, size_t, std::string*);
   1.128 +template void PrepareForUTF8Output(const char16*, size_t, std::string*);
   1.129 +
   // Empties |output| and, for non-empty input, reserves capacity for the wide
   // text about to be produced from the |src_len| UTF-8 bytes at |src|.
   //
   // Only the first byte is inspected: if it is below 0x80 the input is guessed
   // to be all ASCII (one output element per byte), otherwise every character is
   // guessed to occupy two bytes and half of |src_len| is reserved.
   template<typename STRING>
   void PrepareForUTF16Or32Output(const char* src,
                                  size_t src_len,
                                  STRING* output) {
     output->clear();
     if (src_len == 0)
       return;

     // Compare as unsigned so that bytes >= 0x80 are not seen as negative.
     const unsigned char lead_byte = static_cast<unsigned char>(src[0]);
     output->reserve(lead_byte < 0x80 ? src_len : src_len / 2);
   }
   1.146 +
   1.147 +// Instantiate versions we know callers will need.
   1.148 +template void PrepareForUTF16Or32Output(const char*, size_t, std::wstring*);
   1.149 +template void PrepareForUTF16Or32Output(const char*, size_t, string16*);
   1.150 +
   1.151 +}  // namespace base

mercurial