gfx/graphite2/src/inc/UtfCodec.h

branch
TOR_BUG_9701
changeset 15
b8a032363ba2
equal deleted inserted replaced
-1:000000000000 0:8f303fea2069
1 /* GRAPHITE2 LICENSING
2
3 Copyright 2011, SIL International
4 All rights reserved.
5
6 This library is free software; you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as published
8 by the Free Software Foundation; either version 2.1 of License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should also have received a copy of the GNU Lesser General Public
17 License along with this library in the file named "LICENSE".
18 If not, write to the Free Software Foundation, 51 Franklin Street,
19 Suite 500, Boston, MA 02110-1335, USA or visit their web page on the
20 internet at http://www.fsf.org/licenses/lgpl.html.
21
22 Alternatively, the contents of this file may be used under the terms of the
23 Mozilla Public License (http://mozilla.org/MPL) or the GNU General Public
24 License, as published by the Free Software Foundation, either version 2
25 of the License or (at your option) any later version.
26 */
27 #pragma once
28
29 #include <cstdlib>
30 #include "inc/Main.h"
31
32 namespace graphite2 {
33
34 typedef uint32 uchar_t;
35
36 template <int N>
37 struct _utf_codec
38 {
39 typedef uchar_t codeunit_t;
40
41 static void put(codeunit_t * cp, const uchar_t , int8 & len) throw();
42 static uchar_t get(const codeunit_t * cp, int8 & len) throw();
43 };
44
45
46 template <>
47 struct _utf_codec<32>
48 {
49 private:
50 static const uchar_t limit = 0x110000;
51 public:
52 typedef uint32 codeunit_t;
53
54 inline
55 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
56 {
57 *cp = usv; l = 1;
58 }
59
60 inline
61 static uchar_t get(const codeunit_t * cp, int8 & l) throw()
62 {
63 if (cp[0] < limit) { l = 1; return cp[0]; }
64 else { l = -1; return 0xFFFD; }
65 }
66 };
67
68
69 template <>
70 struct _utf_codec<16>
71 {
72 private:
73 static const int32 lead_offset = 0xD800 - (0x10000 >> 10);
74 static const int32 surrogate_offset = 0x10000 - (0xD800 << 10) - 0xDC00;
75 public:
76 typedef uint16 codeunit_t;
77
78 inline
79 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
80 {
81 if (usv < 0x10000) { l = 1; cp[0] = codeunit_t(usv); }
82 else
83 {
84 cp[0] = codeunit_t(lead_offset + (usv >> 10));
85 cp[1] = codeunit_t(0xDC00 + (usv & 0x3FF));
86 l = 2;
87 }
88 }
89
90 inline
91 static uchar_t get(const codeunit_t * cp, int8 & l) throw()
92 {
93 const uint32 uh = cp[0];
94 l = 1;
95
96 if (0xD800 > uh || uh > 0xDFFF) { return uh; }
97 const uint32 ul = cp[1];
98 if (uh > 0xDBFF || 0xDC00 > ul || ul > 0xDFFF) { l = -1; return 0xFFFD; }
99 ++l;
100 return (uh<<10) + ul + surrogate_offset;
101 }
102 };
103
104
105 template <>
106 struct _utf_codec<8>
107 {
108 private:
109 static const int8 sz_lut[16];
110 static const byte mask_lut[5];
111
112
113 public:
114 typedef uint8 codeunit_t;
115
116 inline
117 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
118 {
119 if (usv < 0x80) {l = 1; cp[0] = usv; return; }
120 if (usv < 0x0800) {l = 2; cp[0] = 0xC0 + (usv >> 6); cp[1] = 0x80 + (usv & 0x3F); return; }
121 if (usv < 0x10000) {l = 3; cp[0] = 0xE0 + (usv >> 12); cp[1] = 0x80 + ((usv >> 6) & 0x3F); cp[2] = 0x80 + (usv & 0x3F); return; }
122 else {l = 4; cp[0] = 0xF0 + (usv >> 18); cp[1] = 0x80 + ((usv >> 12) & 0x3F); cp[2] = 0x80 + ((usv >> 6) & 0x3F); cp[3] = 0x80 + (usv & 0x3F); return; }
123 }
124
125 inline
126 static uchar_t get(const codeunit_t * cp, int8 & l) throw()
127 {
128 const int8 seq_sz = sz_lut[*cp >> 4];
129 uchar_t u = *cp & mask_lut[seq_sz];
130 l = 1;
131 bool toolong = false;
132
133 switch(seq_sz) {
134 case 4: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong = (u < 0x10); // no break
135 case 3: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x20); // no break
136 case 2: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x80); // no break
137 case 1: break;
138 case 0: l = -1; return 0xFFFD;
139 }
140
141 if (l != seq_sz || toolong)
142 {
143 l = -l;
144 return 0xFFFD;
145 }
146 return u;
147 }
148 };
149
150
151 template <typename C>
152 class _utf_iterator
153 {
154 typedef _utf_codec<sizeof(C)*8> codec;
155
156 C * cp;
157 mutable int8 sl;
158
159 public:
160 typedef C codeunit_type;
161 typedef uchar_t value_type;
162 typedef uchar_t * pointer;
163
164 class reference
165 {
166 const _utf_iterator & _i;
167
168 reference(const _utf_iterator & i): _i(i) {}
169 public:
170 operator value_type () const throw () { return codec::get(_i.cp, _i.sl); }
171 reference & operator = (const value_type usv) throw() { codec::put(_i.cp, usv, _i.sl); return *this; }
172
173 friend class _utf_iterator;
174 };
175
176
177 _utf_iterator(const void * us=0) : cp(reinterpret_cast<C *>(const_cast<void *>(us))), sl(1) { }
178
179 _utf_iterator & operator ++ () { cp += abs(sl); return *this; }
180 _utf_iterator operator ++ (int) { _utf_iterator tmp(*this); operator++(); return tmp; }
181
182 bool operator == (const _utf_iterator & rhs) const throw() { return cp >= rhs.cp; }
183 bool operator != (const _utf_iterator & rhs) const throw() { return !operator==(rhs); }
184
185 reference operator * () const throw() { return *this; }
186 pointer operator ->() const throw() { return &operator *(); }
187
188 operator codeunit_type * () const throw() { return cp; }
189
190 bool error() const throw() { return sl < 1; }
191 };
192
193 template <typename C>
194 struct utf
195 {
196 typedef typename _utf_codec<sizeof(C)*8>::codeunit_t codeunit_t;
197
198 typedef _utf_iterator<C> iterator;
199 typedef _utf_iterator<const C> const_iterator;
200 };
201
202
203 typedef utf<uint32> utf32;
204 typedef utf<uint16> utf16;
205 typedef utf<uint8> utf8;
206
207 } // namespace graphite2

mercurial