gfx/graphite2/src/inc/UtfCodec.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /* GRAPHITE2 LICENSING
michael@0 2
michael@0 3 Copyright 2011, SIL International
michael@0 4 All rights reserved.
michael@0 5
michael@0 6 This library is free software; you can redistribute it and/or modify
michael@0 7 it under the terms of the GNU Lesser General Public License as published
michael@0 8 by the Free Software Foundation; either version 2.1 of License, or
michael@0 9 (at your option) any later version.
michael@0 10
michael@0 11 This program is distributed in the hope that it will be useful,
michael@0 12 but WITHOUT ANY WARRANTY; without even the implied warranty of
michael@0 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
michael@0 14 Lesser General Public License for more details.
michael@0 15
michael@0 16 You should also have received a copy of the GNU Lesser General Public
michael@0 17 License along with this library in the file named "LICENSE".
michael@0 18 If not, write to the Free Software Foundation, 51 Franklin Street,
michael@0 19 Suite 500, Boston, MA 02110-1335, USA or visit their web page on the
michael@0 20 internet at http://www.fsf.org/licenses/lgpl.html.
michael@0 21
michael@0 22 Alternatively, the contents of this file may be used under the terms of the
michael@0 23 Mozilla Public License (http://mozilla.org/MPL) or the GNU General Public
michael@0 24 License, as published by the Free Software Foundation, either version 2
michael@0 25 of the License or (at your option) any later version.
michael@0 26 */
michael@0 27 #pragma once
michael@0 28
michael@0 29 #include <cstdlib>
michael@0 30 #include "inc/Main.h"
michael@0 31
michael@0 32 namespace graphite2 {
michael@0 33
michael@0 34 typedef uint32 uchar_t;
michael@0 35
michael@0 36 template <int N>
michael@0 37 struct _utf_codec
michael@0 38 {
michael@0 39 typedef uchar_t codeunit_t;
michael@0 40
michael@0 41 static void put(codeunit_t * cp, const uchar_t , int8 & len) throw();
michael@0 42 static uchar_t get(const codeunit_t * cp, int8 & len) throw();
michael@0 43 };
michael@0 44
michael@0 45
michael@0 46 template <>
michael@0 47 struct _utf_codec<32>
michael@0 48 {
michael@0 49 private:
michael@0 50 static const uchar_t limit = 0x110000;
michael@0 51 public:
michael@0 52 typedef uint32 codeunit_t;
michael@0 53
michael@0 54 inline
michael@0 55 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
michael@0 56 {
michael@0 57 *cp = usv; l = 1;
michael@0 58 }
michael@0 59
michael@0 60 inline
michael@0 61 static uchar_t get(const codeunit_t * cp, int8 & l) throw()
michael@0 62 {
michael@0 63 if (cp[0] < limit) { l = 1; return cp[0]; }
michael@0 64 else { l = -1; return 0xFFFD; }
michael@0 65 }
michael@0 66 };
michael@0 67
michael@0 68
michael@0 69 template <>
michael@0 70 struct _utf_codec<16>
michael@0 71 {
michael@0 72 private:
michael@0 73 static const int32 lead_offset = 0xD800 - (0x10000 >> 10);
michael@0 74 static const int32 surrogate_offset = 0x10000 - (0xD800 << 10) - 0xDC00;
michael@0 75 public:
michael@0 76 typedef uint16 codeunit_t;
michael@0 77
michael@0 78 inline
michael@0 79 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
michael@0 80 {
michael@0 81 if (usv < 0x10000) { l = 1; cp[0] = codeunit_t(usv); }
michael@0 82 else
michael@0 83 {
michael@0 84 cp[0] = codeunit_t(lead_offset + (usv >> 10));
michael@0 85 cp[1] = codeunit_t(0xDC00 + (usv & 0x3FF));
michael@0 86 l = 2;
michael@0 87 }
michael@0 88 }
michael@0 89
michael@0 90 inline
michael@0 91 static uchar_t get(const codeunit_t * cp, int8 & l) throw()
michael@0 92 {
michael@0 93 const uint32 uh = cp[0];
michael@0 94 l = 1;
michael@0 95
michael@0 96 if (0xD800 > uh || uh > 0xDFFF) { return uh; }
michael@0 97 const uint32 ul = cp[1];
michael@0 98 if (uh > 0xDBFF || 0xDC00 > ul || ul > 0xDFFF) { l = -1; return 0xFFFD; }
michael@0 99 ++l;
michael@0 100 return (uh<<10) + ul + surrogate_offset;
michael@0 101 }
michael@0 102 };
michael@0 103
michael@0 104
michael@0 105 template <>
michael@0 106 struct _utf_codec<8>
michael@0 107 {
michael@0 108 private:
michael@0 109 static const int8 sz_lut[16];
michael@0 110 static const byte mask_lut[5];
michael@0 111
michael@0 112
michael@0 113 public:
michael@0 114 typedef uint8 codeunit_t;
michael@0 115
michael@0 116 inline
michael@0 117 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
michael@0 118 {
michael@0 119 if (usv < 0x80) {l = 1; cp[0] = usv; return; }
michael@0 120 if (usv < 0x0800) {l = 2; cp[0] = 0xC0 + (usv >> 6); cp[1] = 0x80 + (usv & 0x3F); return; }
michael@0 121 if (usv < 0x10000) {l = 3; cp[0] = 0xE0 + (usv >> 12); cp[1] = 0x80 + ((usv >> 6) & 0x3F); cp[2] = 0x80 + (usv & 0x3F); return; }
michael@0 122 else {l = 4; cp[0] = 0xF0 + (usv >> 18); cp[1] = 0x80 + ((usv >> 12) & 0x3F); cp[2] = 0x80 + ((usv >> 6) & 0x3F); cp[3] = 0x80 + (usv & 0x3F); return; }
michael@0 123 }
michael@0 124
michael@0 125 inline
michael@0 126 static uchar_t get(const codeunit_t * cp, int8 & l) throw()
michael@0 127 {
michael@0 128 const int8 seq_sz = sz_lut[*cp >> 4];
michael@0 129 uchar_t u = *cp & mask_lut[seq_sz];
michael@0 130 l = 1;
michael@0 131 bool toolong = false;
michael@0 132
michael@0 133 switch(seq_sz) {
michael@0 134 case 4: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong = (u < 0x10); // no break
michael@0 135 case 3: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x20); // no break
michael@0 136 case 2: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x80); // no break
michael@0 137 case 1: break;
michael@0 138 case 0: l = -1; return 0xFFFD;
michael@0 139 }
michael@0 140
michael@0 141 if (l != seq_sz || toolong)
michael@0 142 {
michael@0 143 l = -l;
michael@0 144 return 0xFFFD;
michael@0 145 }
michael@0 146 return u;
michael@0 147 }
michael@0 148 };
michael@0 149
michael@0 150
michael@0 151 template <typename C>
michael@0 152 class _utf_iterator
michael@0 153 {
michael@0 154 typedef _utf_codec<sizeof(C)*8> codec;
michael@0 155
michael@0 156 C * cp;
michael@0 157 mutable int8 sl;
michael@0 158
michael@0 159 public:
michael@0 160 typedef C codeunit_type;
michael@0 161 typedef uchar_t value_type;
michael@0 162 typedef uchar_t * pointer;
michael@0 163
michael@0 164 class reference
michael@0 165 {
michael@0 166 const _utf_iterator & _i;
michael@0 167
michael@0 168 reference(const _utf_iterator & i): _i(i) {}
michael@0 169 public:
michael@0 170 operator value_type () const throw () { return codec::get(_i.cp, _i.sl); }
michael@0 171 reference & operator = (const value_type usv) throw() { codec::put(_i.cp, usv, _i.sl); return *this; }
michael@0 172
michael@0 173 friend class _utf_iterator;
michael@0 174 };
michael@0 175
michael@0 176
michael@0 177 _utf_iterator(const void * us=0) : cp(reinterpret_cast<C *>(const_cast<void *>(us))), sl(1) { }
michael@0 178
michael@0 179 _utf_iterator & operator ++ () { cp += abs(sl); return *this; }
michael@0 180 _utf_iterator operator ++ (int) { _utf_iterator tmp(*this); operator++(); return tmp; }
michael@0 181
michael@0 182 bool operator == (const _utf_iterator & rhs) const throw() { return cp >= rhs.cp; }
michael@0 183 bool operator != (const _utf_iterator & rhs) const throw() { return !operator==(rhs); }
michael@0 184
michael@0 185 reference operator * () const throw() { return *this; }
michael@0 186 pointer operator ->() const throw() { return &operator *(); }
michael@0 187
michael@0 188 operator codeunit_type * () const throw() { return cp; }
michael@0 189
michael@0 190 bool error() const throw() { return sl < 1; }
michael@0 191 };
michael@0 192
michael@0 193 template <typename C>
michael@0 194 struct utf
michael@0 195 {
michael@0 196 typedef typename _utf_codec<sizeof(C)*8>::codeunit_t codeunit_t;
michael@0 197
michael@0 198 typedef _utf_iterator<C> iterator;
michael@0 199 typedef _utf_iterator<const C> const_iterator;
michael@0 200 };
michael@0 201
michael@0 202
michael@0 203 typedef utf<uint32> utf32;
michael@0 204 typedef utf<uint16> utf16;
michael@0 205 typedef utf<uint8> utf8;
michael@0 206
michael@0 207 } // namespace graphite2

mercurial