intl/lwbrk/src/jisx4051pairtable.txt

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/lwbrk/src/jisx4051pairtable.txt	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,286 @@
     1.4 +
     1.5 +
     1.6 +
     1.7 +/* 
     1.8 +
     1.9 +   Simplification of Pair Table in JIS X 4051
    1.10 +
    1.11 +   1. The Original Table - in 4.1.3
    1.12 +
    1.13 +   In JIS X 4051, the pair table is defined as below
    1.14 +
    1.15 +   Class of
    1.16 +   Leading    Class of Trailing Char Class
    1.17 +   Char        
    1.18 +
    1.19 +              1  2  3  4  5  6  7  8  9 10 11 12 13 13 14 14 15 16 17 18 19 20
    1.20 +                                                 *  #  *  #
    1.21 +        1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  E
    1.22 +        2        X  X  X  X  X                                               X
    1.23 +        3        X  X  X  X  X                                               X
    1.24 +        4        X  X  X  X  X                                               X
    1.25 +        5        X  X  X  X  X                                               X
    1.26 +        6        X  X  X  X  X                                               X
    1.27 +        7        X  X  X  X  X  X                                            X 
    1.28 +        8        X  X  X  X  X                                X              E 
    1.29 +        9        X  X  X  X  X                                               X
    1.30 +       10        X  X  X  X  X                                               X
    1.31 +       11        X  X  X  X  X                                               X
    1.32 +       12        X  X  X  X  X                                               X  
    1.33 +       13        X  X  X  X  X                    X                          X
    1.34 +       14        X  X  X  X  X                          X                    X
    1.35 +       15        X  X  X  X  X        X                       X        X     X 
    1.36 +       16        X  X  X  X  X                                   X     X     X
    1.37 +       17        X  X  X  X  X                                               E 
    1.38 +       18        X  X  X  X  X                                X  X     X     X 
    1.39 +       19     X  E  E  E  E  E  X  X  X  X  X  X  X  X  X  X  X  X  E  X  E  E
    1.40 +       20        X  X  X  X  X                                               E
    1.41 +
    1.42 +   * Same Char
    1.43 +   # Other Char
    1.44 +
    1.45 +   2. Simplified by removing the classes which we do not care about
    1.46 +
    1.47 +   However, since we do not care about class 13(Subscript), 14(Ruby), 
    1.48 +   19(split line note begin quote), and 20(split line note end quote) 
    1.49 +   we can simplify this pair table into the following 
    1.50 +
    1.51 +   Class of
    1.52 +   Leading    Class of Trailing Char Class
    1.53 +   Char        
    1.54 +
    1.55 +              1  2  3  4  5  6  7  8  9 10 11 12 15 16 17 18 
    1.56 +                                                 
    1.57 +        1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X
    1.58 +        2        X  X  X  X  X                             
    1.59 +        3        X  X  X  X  X                            
    1.60 +        4        X  X  X  X  X                           
    1.61 +        5        X  X  X  X  X                          
    1.62 +        6        X  X  X  X  X                         
    1.63 +        7        X  X  X  X  X  X                      
    1.64 +        8        X  X  X  X  X                    X    
    1.65 +        9        X  X  X  X  X                                   
    1.66 +       10        X  X  X  X  X                                  
    1.67 +       11        X  X  X  X  X                                 
    1.68 +       12        X  X  X  X  X                                
    1.69 +       15        X  X  X  X  X        X           X        X    
    1.70 +       16        X  X  X  X  X                       X     X    
    1.71 +       17        X  X  X  X  X                                  
    1.72 +       18        X  X  X  X  X                    X  X     X    
    1.73 +
    1.74 +   3. Simplified by merging classes
    1.75 +
    1.76 +   After the second simplification, the pair table has some duplication 
    1.77 +   a. class 2, 3, 4, 5, 6 are the same - we can merge them
    1.78 +   b. class 10, 11, 12, 17 are the same - we can merge them
    1.79 +
    1.80 +
    1.81 +   Class of
    1.82 +   Leading    Class of Trailing Char Class
    1.83 +   Char        
    1.84 +
    1.85 +              1 [a] 7  8  9 [b]15 16 18 
    1.86 +                                     
    1.87 +        1     X  X  X  X  X  X  X  X  X
    1.88 +      [a]        X                             
    1.89 +        7        X  X                      
    1.90 +        8        X              X    
    1.91 +        9        X                                   
    1.92 +      [b]        X                                  
    1.93 +       15        X        X     X     X    
    1.94 +       16        X                 X  X    
    1.95 +       18        X              X  X  X    
    1.96 +
    1.97 +
    1.98 +   4. Now we use one bit to encode whether it is breakable, and use 2 bytes
    1.99 +      for one row, then the bit table will look like:
   1.100 +
   1.101 +                 18    <-   1
   1.102 +            
   1.103 +       1  0000 0001 1111 1111  = 0x01FF
   1.104 +      [a] 0000 0000 0000 0010  = 0x0002
   1.105 +       7  0000 0000 0000 0110  = 0x0006
   1.106 +       8  0000 0000 0100 0010  = 0x0042
   1.107 +       9  0000 0000 0000 0010  = 0x0002
   1.108 +      [b] 0000 0000 0000 0010  = 0x0002
   1.109 +      15  0000 0001 0101 0010  = 0x0152
   1.110 +      16  0000 0001 1000 0010  = 0x0182
   1.111 +      18  0000 0001 1100 0010  = 0x01C2
   1.112 +
   1.113 +*/
   1.114 +// Row i = leading class i, bit n = trailing class n; row 5 (group [b]) marks only column [a], i.e. 0x0002.
   1.115 +static uint16_t gJISx4051SimplifiedPair[9] = {
   1.116 +  0x01FF, 0x0002, 0x0006, 0x0042, 0x0002, 0x0002, 0x0152, 0x0182, 0x01C2
   1.117 +};
   1.118 +// Tests the bit for trailing class aCls2 in the table row of leading class aCls1.
   1.119 +PRBool XXXX::ClassesToPair(nsJISx4051Cls aCls1, nsJISx4051Cls aCls2)
   1.120 +{
   1.121 +  NS_ASSERTION( (aCls1 < 9), "invalid class");
   1.122 +  NS_ASSERTION( (aCls2 < 9), "invalid class");
   1.123 +  return ( 0 != (gJISx4051SimplifiedPair[aCls1] & (1L << aCls2) ));
   1.124 +}
   1.125 +
   1.126 +// True when u is an ASCII digit, i.e. within U+0030 ('0') .. U+0039 ('9').
   1.127 +#define X4051_IS_DIGIT(u) ((0x0030 <= (u)) && ((u) <= 0x0039))
   1.128 +// Maps aChar to its (merged) class; aBefore/aAfter are the neighbouring characters, used only for the ','/'.' rule.
   1.129 +nsJISx4051Cls XXXX::GetClass(
   1.130 +   PRUnichar aChar, PRUnichar aBefore = 0, PRUnichar aAfter = 0)
   1.131 +{
   1.132 +   // special case: ',' (0x2C) or '.' (0x2E) between two digits is class 15
   1.133 +   if( ((0x2C == aChar) || (0x2E == aChar)) &&
   1.134 +       (X4051_IS_DIGIT(aBefore)) && (X4051_IS_DIGIT(aAfter)))
   1.135 +   {
   1.136 +     return kJISx4051Cls_15;
   1.137 +   }
   1.138 +   // otherwise consult gSingle first, then gRange
   1.139 +   nsJISx4051Cls cls;
   1.140 +   if(gSingle->Lookup(aChar, &cls))
   1.141 +     return cls;
   1.142 +
   1.143 +   if(gRange->Lookup(aChar, &cls))
   1.144 +     return cls;
   1.145 +   // not found in either table: default to class 15
   1.146 +   return kJISx4051Cls_15;
   1.147 +}
   1.148 +
   1.149 +// Values are indices into gJISx4051SimplifiedPair; classes merged in the table share one value.
   1.150 +typedef enum {
   1.151 +  kJISx4051Cls_1 = 0,
   1.152 +  kJISx4051Cls_2 = 1,   // classes 2-6 form merged group [a]
   1.153 +  kJISx4051Cls_3 = 1,
   1.154 +  kJISx4051Cls_4 = 1,
   1.155 +  kJISx4051Cls_5 = 1,
   1.156 +  kJISx4051Cls_6 = 1,
   1.157 +  kJISx4051Cls_7 = 2,
   1.158 +  kJISx4051Cls_8 = 3,
   1.159 +  kJISx4051Cls_9 = 4,
   1.160 +  kJISx4051Cls_10 = 5,  // classes 10, 11, 12 and 17 form merged group [b]
   1.161 +  kJISx4051Cls_11 = 5,
   1.162 +  kJISx4051Cls_12 = 5,
   1.163 +  // kJISx4051Cls_13 = 0,
   1.164 +  // kJISx4051Cls_14 = 0,
   1.165 +  kJISx4051Cls_15 = 6,
   1.166 +  kJISx4051Cls_16 = 7,
   1.167 +  kJISx4051Cls_17 = 5,  // same row as classes 10-12 (group [b])
   1.168 +  kJISx4051Cls_18 = 8,
   1.169 +  // kJISx4051Cls_19 = 0,
   1.170 +  // kJISx4051Cls_20 = 0
   1.171 +} nsJISx4051Cls;
   1.172 +
   1.173 +
   1.174 +  // Table 2
   1.175 +  YYYY(kJISx4051Cls_1 , 0x0028),
   1.176 +  YYYY(kJISx4051Cls_1 , 0x005B),
   1.177 +  YYYY(kJISx4051Cls_1 , 0x007B),
   1.178 +  YYYY(kJISx4051Cls_1 , 0x2018),
   1.179 +  YYYY(kJISx4051Cls_1 , 0x201B),
   1.180 +  YYYY(kJISx4051Cls_1 , 0x201C),
   1.181 +  YYYY(kJISx4051Cls_1 , 0x201F),
   1.182 +  YYYY(kJISx4051Cls_1 , 0x3008),
   1.183 +  YYYY(kJISx4051Cls_1 , 0x300A),
   1.184 +  YYYY(kJISx4051Cls_1 , 0x300C),
   1.185 +  YYYY(kJISx4051Cls_1 , 0x300E),
   1.186 +  YYYY(kJISx4051Cls_1 , 0x3010),
   1.187 +  YYYY(kJISx4051Cls_1 , 0x3014),
   1.188 +  YYYY(kJISx4051Cls_1 , 0x3016),
   1.189 +  YYYY(kJISx4051Cls_1 , 0x3018),
   1.190 +  YYYY(kJISx4051Cls_1 , 0x301A),
   1.191 +  YYYY(kJISx4051Cls_1 , 0x301D),
   1.192 +
   1.193 +  // Table 3
   1.194 +  YYYY(kJISx4051Cls_2 , 0x0029),
   1.195 +  YYYY(kJISx4051Cls_2 , 0x002C),
   1.196 +  YYYY(kJISx4051Cls_2 , 0x005D),
   1.197 +  YYYY(kJISx4051Cls_2 , 0x007D),
   1.198 +  YYYY(kJISx4051Cls_2 , 0x2019),
   1.199 +  YYYY(kJISx4051Cls_2 , 0x201A),
   1.200 +  YYYY(kJISx4051Cls_2 , 0x201D),
   1.201 +  YYYY(kJISx4051Cls_2 , 0x201E),
   1.202 +  YYYY(kJISx4051Cls_2 , 0x3001),
   1.203 +  YYYY(kJISx4051Cls_2 , 0x3009),
   1.204 +  YYYY(kJISx4051Cls_2 , 0x300B),
   1.205 +  YYYY(kJISx4051Cls_2 , 0x300D),
   1.206 +  YYYY(kJISx4051Cls_2 , 0x300F),
   1.207 +  YYYY(kJISx4051Cls_2 , 0x3011),
   1.208 +  YYYY(kJISx4051Cls_2 , 0x3015),
   1.209 +  YYYY(kJISx4051Cls_2 , 0x3017),
   1.210 +  YYYY(kJISx4051Cls_2 , 0x3019),
   1.211 +  YYYY(kJISx4051Cls_2 , 0x301B),
   1.212 +  YYYY(kJISx4051Cls_2 , 0x301E),
   1.213 +  YYYY(kJISx4051Cls_2 , 0x301F),
   1.214 +
   1.215 +  // Table 4
   1.216 +  YYYY(kJISx4051Cls_3 , 0x203C),
   1.217 +  YYYY(kJISx4051Cls_3 , 0x2044),
   1.218 +  YYYY(kJISx4051Cls_3 , 0x301C),
   1.219 +  YYYY(kJISx4051Cls_3 , 0x3041),
   1.220 +  YYYY(kJISx4051Cls_3 , 0x3043),
   1.221 +  YYYY(kJISx4051Cls_3 , 0x3045),
   1.222 +  YYYY(kJISx4051Cls_3 , 0x3047),
   1.223 +  YYYY(kJISx4051Cls_3 , 0x3049),
   1.224 +  YYYY(kJISx4051Cls_3 , 0x3063),
   1.225 +  YYYY(kJISx4051Cls_3 , 0x3083),
   1.226 +  YYYY(kJISx4051Cls_3 , 0x3085),
   1.227 +  YYYY(kJISx4051Cls_3 , 0x3087),
   1.228 +  YYYY(kJISx4051Cls_3 , 0x308E),
   1.229 +  YYYY(kJISx4051Cls_3 , 0x309D),
   1.230 +  YYYY(kJISx4051Cls_3 , 0x309E),
   1.231 +  YYYY(kJISx4051Cls_3 , 0x30A1),
   1.232 +  YYYY(kJISx4051Cls_3 , 0x30A3),
   1.233 +  YYYY(kJISx4051Cls_3 , 0x30A5),
   1.234 +  YYYY(kJISx4051Cls_3 , 0x30A7),
   1.235 +  YYYY(kJISx4051Cls_3 , 0x30A9),
   1.236 +  YYYY(kJISx4051Cls_3 , 0x30C3),
   1.237 +  YYYY(kJISx4051Cls_3 , 0x30E3),
   1.238 +  YYYY(kJISx4051Cls_3 , 0x30E5),
   1.239 +  YYYY(kJISx4051Cls_3 , 0x30E7),
   1.240 +  YYYY(kJISx4051Cls_3 , 0x30EE),
   1.241 +  YYYY(kJISx4051Cls_3 , 0x30F5),
   1.242 +  YYYY(kJISx4051Cls_3 , 0x30F6),
   1.243 +  YYYY(kJISx4051Cls_3 , 0x30FC),
   1.244 +  YYYY(kJISx4051Cls_3 , 0x30FD),
   1.245 +  YYYY(kJISx4051Cls_3 , 0x30FE),
   1.246 +
   1.247 +  // Table 5
   1.248 +  YYYY(kJISx4051Cls_4 , 0x0021),
   1.249 +  YYYY(kJISx4051Cls_4 , 0x003F),
   1.250 +   
   1.251 +  // Table 6
   1.252 +  YYYY(kJISx4051Cls_5 , 0x003A),
   1.253 +  YYYY(kJISx4051Cls_5 , 0x003B),
   1.254 +  YYYY(kJISx4051Cls_5 , 0x30FB),
   1.255 +
   1.256 +  // Table 7
   1.257 +  YYYY(kJISx4051Cls_6 , 0x002E),
   1.258 +  YYYY(kJISx4051Cls_6 , 0x3002),
   1.259 +
   1.260 +  // Table 8
   1.261 +  YYYY(kJISx4051Cls_7 , 0x2014),
   1.262 +  YYYY(kJISx4051Cls_7 , 0x2024),
   1.263 +  YYYY(kJISx4051Cls_7 , 0x2025),
   1.264 +  YYYY(kJISx4051Cls_7 , 0x2026),
   1.265 +
   1.266 +  // Table 9
   1.267 +  YYYY(kJISx4051Cls_8 , 0x0024),
   1.268 +  YYYY(kJISx4051Cls_8 , 0x00A3),
   1.269 +  YYYY(kJISx4051Cls_8 , 0x00A5),
   1.270 +  YYYY(kJISx4051Cls_8 , 0x2116),
   1.271 +
   1.272 +  // Table 10
   1.273 +  YYYY(kJISx4051Cls_9 , 0x0025),
   1.274 +  YYYY(kJISx4051Cls_9 , 0x00A2),
   1.275 +  YYYY(kJISx4051Cls_9 , 0x00B0),
   1.276 +  YYYY(kJISx4051Cls_9 , 0x2030),
   1.277 +  YYYY(kJISx4051Cls_9 , 0x2031),
   1.278 +  YYYY(kJISx4051Cls_9 , 0x2032),
   1.279 +  YYYY(kJISx4051Cls_9 , 0x2033),
   1.280 +
   1.281 +  // Table 1
   1.282 +  YYYY(kJISx4051Cls_10, 0x3000),
   1.283 +
   1.284 +  // Table 1
   1.285 +  ZZZZ(kJISx4051Cls_11, 0x3000),
   1.286 +
   1.287 +
   1.288 +
   1.289 +

mercurial