Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
1 /* GRAPHITE2 LICENSING
3 Copyright 2011, SIL International
4 All rights reserved.
6 This library is free software; you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as published
8 by the Free Software Foundation; either version 2.1 of License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should also have received a copy of the GNU Lesser General Public
17 License along with this library in the file named "LICENSE".
18 If not, write to the Free Software Foundation, 51 Franklin Street,
19 Suite 500, Boston, MA 02110-1335, USA or visit their web page on the
20 internet at http://www.fsf.org/licenses/lgpl.html.
22 Alternatively, the contents of this file may be used under the terms of the
23 Mozilla Public License (http://mozilla.org/MPL) or the GNU General Public
24 License, as published by the Free Software Foundation, either version 2
25 of the License or (at your option) any later version.
26 */
27 #pragma once
29 #include <cstdlib>
30 #include "inc/Main.h"
32 namespace graphite2 {
34 typedef uint32 uchar_t;
36 template <int N>
37 struct _utf_codec
38 {
39 typedef uchar_t codeunit_t;
41 static void put(codeunit_t * cp, const uchar_t , int8 & len) throw();
42 static uchar_t get(const codeunit_t * cp, int8 & len) throw();
43 };
46 template <>
47 struct _utf_codec<32>
48 {
49 private:
50 static const uchar_t limit = 0x110000;
51 public:
52 typedef uint32 codeunit_t;
54 inline
55 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
56 {
57 *cp = usv; l = 1;
58 }
60 inline
61 static uchar_t get(const codeunit_t * cp, int8 & l) throw()
62 {
63 if (cp[0] < limit) { l = 1; return cp[0]; }
64 else { l = -1; return 0xFFFD; }
65 }
66 };
69 template <>
70 struct _utf_codec<16>
71 {
72 private:
73 static const int32 lead_offset = 0xD800 - (0x10000 >> 10);
74 static const int32 surrogate_offset = 0x10000 - (0xD800 << 10) - 0xDC00;
75 public:
76 typedef uint16 codeunit_t;
78 inline
79 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
80 {
81 if (usv < 0x10000) { l = 1; cp[0] = codeunit_t(usv); }
82 else
83 {
84 cp[0] = codeunit_t(lead_offset + (usv >> 10));
85 cp[1] = codeunit_t(0xDC00 + (usv & 0x3FF));
86 l = 2;
87 }
88 }
90 inline
91 static uchar_t get(const codeunit_t * cp, int8 & l) throw()
92 {
93 const uint32 uh = cp[0];
94 l = 1;
96 if (0xD800 > uh || uh > 0xDFFF) { return uh; }
97 const uint32 ul = cp[1];
98 if (uh > 0xDBFF || 0xDC00 > ul || ul > 0xDFFF) { l = -1; return 0xFFFD; }
99 ++l;
100 return (uh<<10) + ul + surrogate_offset;
101 }
102 };
105 template <>
106 struct _utf_codec<8>
107 {
108 private:
109 static const int8 sz_lut[16];
110 static const byte mask_lut[5];
113 public:
114 typedef uint8 codeunit_t;
116 inline
117 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
118 {
119 if (usv < 0x80) {l = 1; cp[0] = usv; return; }
120 if (usv < 0x0800) {l = 2; cp[0] = 0xC0 + (usv >> 6); cp[1] = 0x80 + (usv & 0x3F); return; }
121 if (usv < 0x10000) {l = 3; cp[0] = 0xE0 + (usv >> 12); cp[1] = 0x80 + ((usv >> 6) & 0x3F); cp[2] = 0x80 + (usv & 0x3F); return; }
122 else {l = 4; cp[0] = 0xF0 + (usv >> 18); cp[1] = 0x80 + ((usv >> 12) & 0x3F); cp[2] = 0x80 + ((usv >> 6) & 0x3F); cp[3] = 0x80 + (usv & 0x3F); return; }
123 }
125 inline
126 static uchar_t get(const codeunit_t * cp, int8 & l) throw()
127 {
128 const int8 seq_sz = sz_lut[*cp >> 4];
129 uchar_t u = *cp & mask_lut[seq_sz];
130 l = 1;
131 bool toolong = false;
133 switch(seq_sz) {
134 case 4: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong = (u < 0x10); // no break
135 case 3: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x20); // no break
136 case 2: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x80); // no break
137 case 1: break;
138 case 0: l = -1; return 0xFFFD;
139 }
141 if (l != seq_sz || toolong)
142 {
143 l = -l;
144 return 0xFFFD;
145 }
146 return u;
147 }
148 };
151 template <typename C>
152 class _utf_iterator
153 {
154 typedef _utf_codec<sizeof(C)*8> codec;
156 C * cp;
157 mutable int8 sl;
159 public:
160 typedef C codeunit_type;
161 typedef uchar_t value_type;
162 typedef uchar_t * pointer;
164 class reference
165 {
166 const _utf_iterator & _i;
168 reference(const _utf_iterator & i): _i(i) {}
169 public:
170 operator value_type () const throw () { return codec::get(_i.cp, _i.sl); }
171 reference & operator = (const value_type usv) throw() { codec::put(_i.cp, usv, _i.sl); return *this; }
173 friend class _utf_iterator;
174 };
177 _utf_iterator(const void * us=0) : cp(reinterpret_cast<C *>(const_cast<void *>(us))), sl(1) { }
179 _utf_iterator & operator ++ () { cp += abs(sl); return *this; }
180 _utf_iterator operator ++ (int) { _utf_iterator tmp(*this); operator++(); return tmp; }
182 bool operator == (const _utf_iterator & rhs) const throw() { return cp >= rhs.cp; }
183 bool operator != (const _utf_iterator & rhs) const throw() { return !operator==(rhs); }
185 reference operator * () const throw() { return *this; }
186 pointer operator ->() const throw() { return &operator *(); }
188 operator codeunit_type * () const throw() { return cp; }
190 bool error() const throw() { return sl < 1; }
191 };
193 template <typename C>
194 struct utf
195 {
196 typedef typename _utf_codec<sizeof(C)*8>::codeunit_t codeunit_t;
198 typedef _utf_iterator<C> iterator;
199 typedef _utf_iterator<const C> const_iterator;
200 };
203 typedef utf<uint32> utf32;
204 typedef utf<uint16> utf16;
205 typedef utf<uint8> utf8;
207 } // namespace graphite2