gfx/graphite2/src/inc/UtfCodec.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/graphite2/src/inc/UtfCodec.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,207 @@
     1.4 +/*  GRAPHITE2 LICENSING
     1.5 +
     1.6 +    Copyright 2011, SIL International
     1.7 +    All rights reserved.
     1.8 +
     1.9 +    This library is free software; you can redistribute it and/or modify
    1.10 +    it under the terms of the GNU Lesser General Public License as published
    1.11 +    by the Free Software Foundation; either version 2.1 of License, or
    1.12 +    (at your option) any later version.
    1.13 +
    1.14 +    This program is distributed in the hope that it will be useful,
    1.15 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.16 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.17 +    Lesser General Public License for more details.
    1.18 +
    1.19 +    You should also have received a copy of the GNU Lesser General Public
    1.20 +    License along with this library in the file named "LICENSE".
    1.21 +    If not, write to the Free Software Foundation, 51 Franklin Street,
    1.22 +    Suite 500, Boston, MA 02110-1335, USA or visit their web page on the
    1.23 +    internet at http://www.fsf.org/licenses/lgpl.html.
    1.24 +
    1.25 +Alternatively, the contents of this file may be used under the terms of the
    1.26 +Mozilla Public License (http://mozilla.org/MPL) or the GNU General Public
    1.27 +License, as published by the Free Software Foundation, either version 2
    1.28 +of the License or (at your option) any later version.
    1.29 +*/
    1.30 +#pragma once
    1.31 +
    1.32 +#include <cstdlib>
    1.33 +#include "inc/Main.h"
    1.34 +
    1.35 +namespace graphite2 {
    1.36 +
    1.37 +typedef uint32  uchar_t;
    1.38 +
    1.39 +template <int N>
    1.40 +struct _utf_codec
    1.41 +{
    1.42 +    typedef uchar_t codeunit_t;
    1.43 +
    1.44 +    static void     put(codeunit_t * cp, const uchar_t , int8 & len) throw();
    1.45 +    static uchar_t  get(const codeunit_t * cp, int8 & len) throw();
    1.46 +};
    1.47 +
    1.48 +
    1.49 +template <>
    1.50 +struct _utf_codec<32>
    1.51 +{
    1.52 +private:
    1.53 +    static const uchar_t    limit = 0x110000;
    1.54 +public:
    1.55 +    typedef uint32  codeunit_t;
    1.56 +
    1.57 +    inline
    1.58 +    static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
    1.59 +    {
    1.60 +        *cp = usv; l = 1;
    1.61 +    }
    1.62 +
    1.63 +    inline
    1.64 +    static uchar_t get(const codeunit_t * cp, int8 & l) throw()
    1.65 +    {
    1.66 +        if (cp[0] < limit)  { l = 1;  return cp[0]; }
    1.67 +        else                { l = -1; return 0xFFFD; }
    1.68 +    }
    1.69 +};
    1.70 +
    1.71 +
    1.72 +template <>
    1.73 +struct _utf_codec<16>
    1.74 +{
    1.75 +private:
    1.76 +    static const int32  lead_offset      = 0xD800 - (0x10000 >> 10);
    1.77 +    static const int32  surrogate_offset = 0x10000 - (0xD800 << 10) - 0xDC00;
    1.78 +public:
    1.79 +    typedef uint16  codeunit_t;
    1.80 +
    1.81 +    inline
    1.82 +    static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
    1.83 +    {
    1.84 +        if (usv < 0x10000)  { l = 1; cp[0] = codeunit_t(usv); }
    1.85 +        else
    1.86 +        {
    1.87 +            cp[0] = codeunit_t(lead_offset + (usv >> 10));
    1.88 +            cp[1] = codeunit_t(0xDC00 + (usv & 0x3FF));
    1.89 +            l = 2;
    1.90 +        }
    1.91 +    }
    1.92 +
    1.93 +    inline
    1.94 +    static uchar_t get(const codeunit_t * cp, int8 & l) throw()
    1.95 +    {
    1.96 +        const uint32    uh = cp[0];
    1.97 +        l = 1;
    1.98 +
    1.99 +        if (0xD800 > uh || uh > 0xDFFF) { return uh; }
   1.100 +        const uint32 ul = cp[1];
   1.101 +        if (uh > 0xDBFF || 0xDC00 > ul || ul > 0xDFFF) { l = -1; return 0xFFFD; }
   1.102 +        ++l;
   1.103 +        return (uh<<10) + ul + surrogate_offset;
   1.104 +    }
   1.105 +};
   1.106 +
   1.107 +
   1.108 +template <>
   1.109 +struct _utf_codec<8>
   1.110 +{
   1.111 +private:
   1.112 +    static const int8 sz_lut[16];
   1.113 +    static const byte mask_lut[5];
   1.114 +
   1.115 +
   1.116 +public:
   1.117 +    typedef uint8   codeunit_t;
   1.118 +
   1.119 +    inline
   1.120 +    static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
   1.121 +    {
   1.122 +        if (usv < 0x80)     {l = 1; cp[0] = usv; return; }
   1.123 +        if (usv < 0x0800)   {l = 2; cp[0] = 0xC0 + (usv >> 6);  cp[1] = 0x80 + (usv & 0x3F); return; }
   1.124 +        if (usv < 0x10000)  {l = 3; cp[0] = 0xE0 + (usv >> 12); cp[1] = 0x80 + ((usv >> 6) & 0x3F);  cp[2] = 0x80 + (usv & 0x3F); return; }
   1.125 +        else                {l = 4; cp[0] = 0xF0 + (usv >> 18); cp[1] = 0x80 + ((usv >> 12) & 0x3F); cp[2] = 0x80 + ((usv >> 6) & 0x3F); cp[3] = 0x80 + (usv & 0x3F); return; }
   1.126 +    }
   1.127 +
   1.128 +    inline
   1.129 +    static uchar_t get(const codeunit_t * cp, int8 & l) throw()
   1.130 +    {
   1.131 +        const int8 seq_sz = sz_lut[*cp >> 4];
   1.132 +        uchar_t u = *cp & mask_lut[seq_sz];
   1.133 +        l = 1;
   1.134 +        bool toolong = false;
   1.135 +
   1.136 +        switch(seq_sz) {
   1.137 +            case 4:     u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong  = (u < 0x10); // no break
   1.138 +            case 3:     u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x20); // no break
   1.139 +            case 2:     u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x80); // no break
   1.140 +            case 1:     break;
   1.141 +            case 0:     l = -1; return 0xFFFD;
   1.142 +        }
   1.143 +
   1.144 +        if (l != seq_sz || toolong)
   1.145 +        {
   1.146 +            l = -l;
   1.147 +            return 0xFFFD;
   1.148 +        }
   1.149 +        return u;
   1.150 +    }
   1.151 +};
   1.152 +
   1.153 +
   1.154 +template <typename C>
   1.155 +class _utf_iterator
   1.156 +{
   1.157 +    typedef _utf_codec<sizeof(C)*8> codec;
   1.158 +
   1.159 +    C             * cp;
   1.160 +    mutable int8    sl;
   1.161 +
   1.162 +public:
   1.163 +    typedef C           codeunit_type;
   1.164 +    typedef uchar_t     value_type;
   1.165 +    typedef uchar_t   * pointer;
   1.166 +
   1.167 +    class reference
   1.168 +    {
   1.169 +        const _utf_iterator & _i;
   1.170 +
   1.171 +        reference(const _utf_iterator & i): _i(i) {}
   1.172 +    public:
   1.173 +        operator value_type () const throw ()                   { return codec::get(_i.cp, _i.sl); }
   1.174 +        reference & operator = (const value_type usv) throw()   { codec::put(_i.cp, usv, _i.sl); return *this; }
   1.175 +
   1.176 +        friend class _utf_iterator;
   1.177 +    };
   1.178 +
   1.179 +
   1.180 +    _utf_iterator(const void * us=0)    : cp(reinterpret_cast<C *>(const_cast<void *>(us))), sl(1) { }
   1.181 +
   1.182 +    _utf_iterator   & operator ++ ()    { cp += abs(sl); return *this; }
   1.183 +    _utf_iterator   operator ++ (int)   { _utf_iterator tmp(*this); operator++(); return tmp; }
   1.184 +
   1.185 +    bool operator == (const _utf_iterator & rhs) const throw() { return cp >= rhs.cp; }
   1.186 +    bool operator != (const _utf_iterator & rhs) const throw() { return !operator==(rhs); }
   1.187 +
   1.188 +    reference   operator * () const throw() { return *this; }
   1.189 +    pointer     operator ->() const throw() { return &operator *(); }
   1.190 +
   1.191 +    operator codeunit_type * () const throw() { return cp; }
   1.192 +
   1.193 +    bool error() const throw()  { return sl < 1; }
   1.194 +};
   1.195 +
   1.196 +template <typename C>
   1.197 +struct utf
   1.198 +{
   1.199 +    typedef typename _utf_codec<sizeof(C)*8>::codeunit_t codeunit_t;
   1.200 +
   1.201 +    typedef _utf_iterator<C>        iterator;
   1.202 +    typedef _utf_iterator<const C>  const_iterator;
   1.203 +};
   1.204 +
   1.205 +
   1.206 +typedef utf<uint32> utf32;
   1.207 +typedef utf<uint16> utf16;
   1.208 +typedef utf<uint8>  utf8;
   1.209 +
   1.210 +} // namespace graphite2

mercurial