1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/graphite2/src/inc/UtfCodec.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,207 @@ 1.4 +/* GRAPHITE2 LICENSING 1.5 + 1.6 + Copyright 2011, SIL International 1.7 + All rights reserved. 1.8 + 1.9 + This library is free software; you can redistribute it and/or modify 1.10 + it under the terms of the GNU Lesser General Public License as published 1.11 + by the Free Software Foundation; either version 2.1 of License, or 1.12 + (at your option) any later version. 1.13 + 1.14 + This program is distributed in the hope that it will be useful, 1.15 + but WITHOUT ANY WARRANTY; without even the implied warranty of 1.16 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 1.17 + Lesser General Public License for more details. 1.18 + 1.19 + You should also have received a copy of the GNU Lesser General Public 1.20 + License along with this library in the file named "LICENSE". 1.21 + If not, write to the Free Software Foundation, 51 Franklin Street, 1.22 + Suite 500, Boston, MA 02110-1335, USA or visit their web page on the 1.23 + internet at http://www.fsf.org/licenses/lgpl.html. 1.24 + 1.25 +Alternatively, the contents of this file may be used under the terms of the 1.26 +Mozilla Public License (http://mozilla.org/MPL) or the GNU General Public 1.27 +License, as published by the Free Software Foundation, either version 2 1.28 +of the License or (at your option) any later version. 1.29 +*/ 1.30 +#pragma once 1.31 + 1.32 +#include <cstdlib> 1.33 +#include "inc/Main.h" 1.34 + 1.35 +namespace graphite2 { 1.36 + 1.37 +typedef uint32 uchar_t; 1.38 + 1.39 +template <int N> 1.40 +struct _utf_codec 1.41 +{ 1.42 + typedef uchar_t codeunit_t; 1.43 + 1.44 + static void put(codeunit_t * cp, const uchar_t , int8 & len) throw(); 1.45 + static uchar_t get(const codeunit_t * cp, int8 & len) throw(); 1.46 +}; 1.47 + 1.48 + 1.49 +template <> 1.50 +struct _utf_codec<32> 1.51 +{ 1.52 +private: 1.53 + static const uchar_t limit = 0x110000; 1.54 +public: 1.55 + typedef uint32 codeunit_t; 1.56 + 1.57 + inline 1.58 + static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw() 1.59 + { 1.60 + *cp = usv; l = 1; 1.61 + } 1.62 + 1.63 + inline 1.64 + static uchar_t get(const codeunit_t * cp, int8 & l) throw() 1.65 + { 1.66 + if (cp[0] < limit) { l = 1; return cp[0]; } 1.67 + else { l = -1; return 0xFFFD; } 1.68 + } 1.69 +}; 1.70 + 1.71 + 1.72 +template <> 1.73 +struct _utf_codec<16> 1.74 +{ 1.75 +private: 1.76 + static const int32 lead_offset = 0xD800 - (0x10000 >> 10); 1.77 + static const int32 surrogate_offset = 0x10000 - (0xD800 << 10) - 0xDC00; 1.78 +public: 1.79 + typedef uint16 codeunit_t; 1.80 + 1.81 + inline 1.82 + static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw() 1.83 + { 1.84 + if (usv < 0x10000) { l = 1; cp[0] = codeunit_t(usv); } 1.85 + else 1.86 + { 1.87 + cp[0] = codeunit_t(lead_offset + (usv >> 10)); 1.88 + cp[1] = codeunit_t(0xDC00 + (usv & 0x3FF)); 1.89 + l = 2; 1.90 + } 1.91 + } 1.92 + 1.93 + inline 1.94 + static uchar_t get(const codeunit_t * cp, int8 & l) throw() 1.95 + { 1.96 + const uint32 uh = cp[0]; 1.97 + l = 1; 1.98 + 1.99 + if (0xD800 > uh || uh > 0xDFFF) { return uh; } 1.100 + const uint32 ul = cp[1]; 1.101 + if (uh > 0xDBFF || 0xDC00 > ul || ul > 0xDFFF) { l = -1; return 0xFFFD; } 1.102 + ++l; 1.103 + return (uh<<10) + ul + surrogate_offset; 1.104 + } 1.105 +}; 1.106 + 1.107 + 1.108 +template <> 1.109 +struct _utf_codec<8> 1.110 +{ 1.111 +private: 1.112 + static const int8 sz_lut[16]; 1.113 + static const byte mask_lut[5]; 1.114 + 1.115 + 1.116 +public: 1.117 + typedef uint8 codeunit_t; 1.118 + 1.119 + inline 1.120 + static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw() 1.121 + { 1.122 + if (usv < 0x80) {l = 1; cp[0] = usv; return; } 1.123 + if (usv < 0x0800) {l = 2; cp[0] = 0xC0 + (usv >> 6); cp[1] = 0x80 + (usv & 0x3F); return; } 1.124 + if (usv < 0x10000) {l = 3; cp[0] = 0xE0 + (usv >> 12); cp[1] = 0x80 + ((usv >> 6) & 0x3F); cp[2] = 0x80 + (usv & 0x3F); return; } 1.125 + else {l = 4; cp[0] = 0xF0 + (usv >> 18); cp[1] = 0x80 + ((usv >> 12) & 0x3F); cp[2] = 0x80 + ((usv >> 6) & 0x3F); cp[3] = 0x80 + (usv & 0x3F); return; } 1.126 + } 1.127 + 1.128 + inline 1.129 + static uchar_t get(const codeunit_t * cp, int8 & l) throw() 1.130 + { 1.131 + const int8 seq_sz = sz_lut[*cp >> 4]; 1.132 + uchar_t u = *cp & mask_lut[seq_sz]; 1.133 + l = 1; 1.134 + bool toolong = false; 1.135 + 1.136 + switch(seq_sz) { 1.137 + case 4: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong = (u < 0x10); // no break 1.138 + case 3: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x20); // no break 1.139 + case 2: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x80); // no break 1.140 + case 1: break; 1.141 + case 0: l = -1; return 0xFFFD; 1.142 + } 1.143 + 1.144 + if (l != seq_sz || toolong) 1.145 + { 1.146 + l = -l; 1.147 + return 0xFFFD; 1.148 + } 1.149 + return u; 1.150 + } 1.151 +}; 1.152 + 1.153 + 1.154 +template <typename C> 1.155 +class _utf_iterator 1.156 +{ 1.157 + typedef _utf_codec<sizeof(C)*8> codec; 1.158 + 1.159 + C * cp; 1.160 + mutable int8 sl; 1.161 + 1.162 +public: 1.163 + typedef C codeunit_type; 1.164 + typedef uchar_t value_type; 1.165 + typedef uchar_t * pointer; 1.166 + 1.167 + class reference 1.168 + { 1.169 + const _utf_iterator & _i; 1.170 + 1.171 + reference(const _utf_iterator & i): _i(i) {} 1.172 + public: 1.173 + operator value_type () const throw () { return codec::get(_i.cp, _i.sl); } 1.174 + reference & operator = (const value_type usv) throw() { codec::put(_i.cp, usv, _i.sl); return *this; } 1.175 + 1.176 + friend class _utf_iterator; 1.177 + }; 1.178 + 1.179 + 1.180 + _utf_iterator(const void * us=0) : cp(reinterpret_cast<C *>(const_cast<void *>(us))), sl(1) { } 1.181 + 1.182 + _utf_iterator & operator ++ () { cp += abs(sl); return *this; } 1.183 + _utf_iterator operator ++ (int) { _utf_iterator tmp(*this); operator++(); return tmp; } 1.184 + 1.185 + bool operator == (const _utf_iterator & rhs) const throw() { return cp >= rhs.cp; } 1.186 + bool operator != (const _utf_iterator & rhs) const throw() { return !operator==(rhs); } 1.187 + 1.188 + reference operator * () const throw() { return *this; } 1.189 + pointer operator ->() const throw() { return &operator *(); } 1.190 + 1.191 + operator codeunit_type * () const throw() { return cp; } 1.192 + 1.193 + bool error() const throw() { return sl < 1; } 1.194 +}; 1.195 + 1.196 +template <typename C> 1.197 +struct utf 1.198 +{ 1.199 + typedef typename _utf_codec<sizeof(C)*8>::codeunit_t codeunit_t; 1.200 + 1.201 + typedef _utf_iterator<C> iterator; 1.202 + typedef _utf_iterator<const C> const_iterator; 1.203 +}; 1.204 + 1.205 + 1.206 +typedef utf<uint32> utf32; 1.207 +typedef utf<uint16> utf16; 1.208 +typedef utf<uint8> utf8; 1.209 + 1.210 +} // namespace graphite2