intl/icu/source/extra/uconv/uconv.1.in

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 .\" Hey, Emacs! This is -*-nroff-*- you know...
michael@0 2 .\"
michael@0 3 .\" uconv.1: manual page for the uconv utility.
michael@0 4 .\"
michael@0 5 .\" Copyright (C) 2000-2013 IBM, Inc. and others.
michael@0 6 .\"
michael@0 7 .\" Manual page by Yves Arrouye <yves@realnames.com>.
michael@0 8 .\"
michael@0 9 .TH UCONV 1 "2005-jul-1" "ICU MANPAGE" "ICU @VERSION@ Manual"
michael@0 10 .SH NAME
michael@0 11 .B uconv
michael@0 12 \- convert data from one encoding to another
michael@0 13 .SH SYNOPSIS
michael@0 14 .B uconv
michael@0 15 [
michael@0 16 .BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
michael@0 17 ]
michael@0 18 [
michael@0 19 .BI "\-V\fP, \fB\-\-version"
michael@0 20 ]
michael@0 21 [
michael@0 22 .BI "\-s\fP, \fB\-\-silent"
michael@0 23 ]
michael@0 24 [
michael@0 25 .BI "\-v\fP, \fB\-\-verbose"
michael@0 26 ]
michael@0 27 [
michael@0 28 .BI "\-l\fP, \fB\-\-list"
michael@0 29 |
michael@0 30 .BI "\-l\fP, \fB\-\-list\-code" " code"
michael@0 31 |
michael@0 32 .BI "\-\-default-code"
michael@0 33 |
michael@0 34 .BI "\-L\fP, \fB\-\-list\-transliterators"
michael@0 35 ]
michael@0 36 [
michael@0 37 .BI "\-\-canon"
michael@0 38 ]
michael@0 39 [
michael@0 40 .BI "\-x" " transliteration"
michael@0 41 ]
michael@0 42 [
michael@0 43 .BI "\-\-to\-callback" " callback"
michael@0 44 |
michael@0 45 .B "\-c"
michael@0 46 ]
michael@0 47 [
michael@0 48 .BI "\-\-from\-callback" " callback"
michael@0 49 |
michael@0 50 .B "\-i"
michael@0 51 ]
michael@0 52 [
michael@0 53 .BI "\-\-callback" " callback"
michael@0 54 ]
michael@0 55 [
michael@0 56 .BI "\-\-fallback"
michael@0 57 |
michael@0 58 .BI "\-\-no\-fallback"
michael@0 59 ]
michael@0 60 [
michael@0 61 .BI "\-b\fP, \fB\-\-block\-size" " size"
michael@0 62 ]
michael@0 63 [
michael@0 64 .BI "\-f\fP, \fB\-\-from\-code" " encoding"
michael@0 65 ]
michael@0 66 [
michael@0 67 .BI "\-t\fP, \fB\-\-to\-code" " encoding"
michael@0 68 ]
michael@0 69 [
michael@0 70 .BI "\-\-add\-signature"
michael@0 71 ]
michael@0 72 [
michael@0 73 .BI "\-\-remove\-signature"
michael@0 74 ]
michael@0 75 [
michael@0 76 .BI "\-o\fP, \fB\-\-output" " file"
michael@0 77 ]
michael@0 78 [
michael@0 79 .IR file .\|.\|.
michael@0 80 ]
michael@0 81 .SH DESCRIPTION
michael@0 82 .B uconv
michael@0 83 converts, or transcodes, each given
michael@0 84 .I file
michael@0 85 (or its standard input if no
michael@0 86 .I file
michael@0 87 is specified) from one
michael@0 88 .I encoding
michael@0 89 to another.
michael@0 90 The transcoding is done using Unicode as a pivot encoding
michael@0 91 (i.e. the data are first transcoded from their original encoding to
michael@0 92 Unicode, and then from Unicode to the destination encoding).
michael@0 93 .PP
michael@0 94 If an
michael@0 95 .I encoding
michael@0 96 is not specified or is
michael@0 97 .BR - ,
michael@0 98 the default encoding is used. Thus, calling
michael@0 99 .B uconv
michael@0 100 with no
michael@0 101 .I encoding
michael@0 102 provides an easy way to validate and sanitize data files for
michael@0 103 further consumption by tools requiring data in the default encoding.
michael@0 104 .PP
michael@0 105 When calling
michael@0 106 .BR uconv ,
michael@0 107 it is possible to specify callbacks that are used to handle invalid
michael@0 108 characters in the input, or characters that cannot be transcoded to
michael@0 109 the destination encoding. Some encodings, for example, offer a default
michael@0 110 substitution character that can be used to represent the occurrence of
michael@0 111 such characters in the input. Other callbacks offer a useful visual
michael@0 112 representation of the invalid data.
michael@0 113 .PP
michael@0 114 .B uconv
michael@0 115 can also run the specified
michael@0 116 .IR transliteration
michael@0 117 on the transcoded data,
michael@0 118 in which case transliteration will happen as an intermediate step,
michael@0 119 after the data have been transcoded to Unicode.
michael@0 120 The
michael@0 121 .I transliteration
michael@0 122 can be either a list of semicolon-separated transliterator names,
michael@0 123 or an arbitrarily complex set of rules in the ICU transliteration
michael@0 124 rules format.
michael@0 125 .PP
michael@0 126 For transcoding purposes,
michael@0 127 .B uconv
michael@0 128 options are compatible with those of
michael@0 129 .BR iconv (1),
michael@0 130 making it easy to replace it in scripts. It is not necessarily the case,
michael@0 131 however, that the encoding names used by
michael@0 132 .B uconv
michael@0 133 and ICU are the same as the ones used by
michael@0 134 .BR iconv (1).
michael@0 135 Also, options that provide informational data, such as the
michael@0 136 .B \-l\fP, \fB\-\-list
michael@0 137 one offered by some
michael@0 138 .BR iconv (1)
michael@0 139 variants such as GNU's, produce data in a slightly different and
michael@0 140 easier to parse format.
michael@0 141 .SH OPTIONS
michael@0 142 .TP
michael@0 143 .BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
michael@0 144 Print help about usage and exit.
michael@0 145 .TP
michael@0 146 .BR "\-V\fP, \fB\-\-version"
michael@0 147 Print the version of
michael@0 148 .B uconv
michael@0 149 and exit.
michael@0 150 .TP
michael@0 151 .BI "\-s\fP, \fB\-\-silent"
michael@0 152 Suppress messages during execution.
michael@0 153 .TP
michael@0 154 .BI "\-v\fP, \fB\-\-verbose"
michael@0 155 Display extra informative messages during execution.
michael@0 156 .TP
michael@0 157 .BI "\-l\fP, \fB\-\-list"
michael@0 158 List all the available encodings and exit.
michael@0 159 .TP
michael@0 160 .BI "\-l\fP, \fB\-\-list\-code" " code"
michael@0 161 List only the
michael@0 162 .I code
michael@0 163 encoding and exit. If
michael@0 164 .I code
michael@0 165 is not a proper encoding, exit with an error.
michael@0 166 .TP
michael@0 167 .BI "\-\-default-code"
michael@0 168 List only the name of the default encoding and exit.
michael@0 169 .TP
michael@0 170 .BI "\-L\fP, \fB\-\-list\-transliterators"
michael@0 171 List all the available transliterators and exit.
michael@0 172 .TP
michael@0 173 .BI "\-\-canon"
michael@0 174 If used with
michael@0 175 .BI "\-l\fP, \fB\-\-list"
michael@0 176 or
michael@0 177 .BR "\-\-default-code" ,
michael@0 178 the list of encodings is produced in a format compatible with
michael@0 179 .BR convrtrs.txt (5).
michael@0 180 If used with
michael@0 181 .BR "\-L\fP, \fB\-\-list\-transliterators" ,
michael@0 182 print only one transliterator name per line.
michael@0 183 .TP
michael@0 184 .BI "\-x" " transliteration"
michael@0 185 Run the given
michael@0 186 .IR transliteration
michael@0 187 on the transcoded Unicode data,
michael@0 188 and use the transliterated data as input for the transcoding to
michael@0 189 the destination encoding.
michael@0 190 .TP
michael@0 191 .BI "\-\-to\-callback" " callback"
michael@0 192 Use
michael@0 193 .I callback
michael@0 194 to handle characters that cannot be transcoded to the destination
michael@0 195 encoding. See section
michael@0 196 .B CALLBACKS
michael@0 197 for details on valid callbacks.
michael@0 198 .TP
michael@0 199 .B "\-c"
michael@0 200 Omit invalid characters from the output.
michael@0 201 Same as
michael@0 202 .BR "\-\-to\-callback skip" .
michael@0 203 .TP
michael@0 204 .BI "\-\-from\-callback" " callback"
michael@0 205 Use
michael@0 206 .I callback
michael@0 207 to handle characters that cannot be transcoded from the original
michael@0 208 encoding. See section
michael@0 209 .B CALLBACKS
michael@0 210 for details on valid callbacks.
michael@0 211 .TP
michael@0 212 .B "\-i"
michael@0 213 Ignore invalid sequences in the input.
michael@0 214 Same as
michael@0 215 .BR "\-\-from\-callback skip" .
michael@0 216 .TP
michael@0 217 .BI "\-\-callback" " callback"
michael@0 218 Use
michael@0 219 .I callback
michael@0 220 to handle both characters that cannot be transcoded from the original
michael@0 221 encoding and characters that cannot be transcoded to the destination
michael@0 222 encoding. See section
michael@0 223 .B CALLBACKS
michael@0 224 for details on valid callbacks.
michael@0 225 .TP
michael@0 226 .BI "\-\-fallback"
michael@0 227 Use the fallback mapping when transcoding from
michael@0 228 Unicode to the destination encoding.
michael@0 229 .TP
michael@0 230 .BI "\-\-no\-fallback"
michael@0 231 Do not use the fallback mapping when transcoding from Unicode to the
michael@0 232 destination encoding.
michael@0 233 This is the default.
michael@0 234 .TP
michael@0 235 .BI "\-b\fP, \fB\-\-block\-size" " size"
michael@0 236 Read input in blocks of
michael@0 237 .I size
michael@0 238 bytes at a time. The default block size is
michael@0 239 4096.
michael@0 240 .TP
michael@0 241 .BI "\-f\fP, \fB\-\-from\-code" " encoding"
michael@0 242 Set the original encoding of the data to
michael@0 243 .IR encoding .
michael@0 244 .TP
michael@0 245 .BI "\-t\fP, \fB\-\-to\-code" " encoding"
michael@0 246 Transcode the data to
michael@0 247 .IR encoding .
michael@0 248 .TP
michael@0 249 .BI "\-\-add\-signature"
michael@0 250 Add a U+FEFF Unicode signature character (BOM) if the output charset
michael@0 251 supports it and does not add one anyway.
michael@0 252 .TP
michael@0 253 .BI "\-\-remove\-signature"
michael@0 254 Remove a U+FEFF Unicode signature character (BOM).
michael@0 255 .TP
michael@0 256 .BI "\-o\fP, \fB\-\-output" " file"
michael@0 257 Write the transcoded data to
michael@0 258 .IR file .
michael@0 259 .SH CALLBACKS
michael@0 260 .B uconv
michael@0 261 supports specifying callbacks to handle invalid data. Callbacks can be
michael@0 262 set for both directions of transcoding: from the original encoding to
michael@0 263 Unicode, with the
michael@0 264 .BR "\-\-from\-callback"
michael@0 265 option, and from Unicode to the destination encoding, with the
michael@0 266 .BR "\-\-to\-callback"
michael@0 267 option.
michael@0 268 .PP
michael@0 269 The following is a list of valid
michael@0 270 .I callback
michael@0 271 names, along with a description of their behavior. The list of
michael@0 272 callbacks actually supported by
michael@0 273 .B uconv
michael@0 274 is displayed when it is called with
michael@0 275 .BR "\-h\fP, \fB\-\-help" .
michael@0 276 .PP
michael@0 277 .TP \w'\fBescape-unicode'u+3n
michael@0 278 .B substitute
michael@0 279 Write the encoding's substitute sequence, or the Unicode
michael@0 280 replacement character
michael@0 281 .B U+FFFD
michael@0 282 when transcoding to Unicode.
michael@0 283 .TP
michael@0 284 .B skip
michael@0 285 Ignore the invalid data.
michael@0 286 .TP
michael@0 287 .B stop
michael@0 288 Stop with an error when encountering invalid data.
michael@0 289 This is the default callback.
michael@0 290 .TP
michael@0 291 .B escape
michael@0 292 Same as
michael@0 293 .BR escape-icu .
michael@0 294 .TP
michael@0 295 .B escape-icu
michael@0 296 Replace the missing characters with a string of the format
michael@0 297 .BR %U\fIhhhh\fP
michael@0 298 for plane 0 characters, and
michael@0 299 .BR %U\fIhhhh\fP%U\fIhhhh\fP
michael@0 300 for planes 1 and above characters,
michael@0 301 where
michael@0 302 .I hhhh
michael@0 303 is the hexadecimal value of one of the UTF-16 code units representing the
michael@0 304 character. Characters from planes 1 and above are written as a pair of
michael@0 305 UTF-16 surrogate code units.
michael@0 306 .TP
michael@0 307 .B escape-java
michael@0 308 Replace the missing characters with a string of the format
michael@0 309 .BR \eu\fIhhhh\fP
michael@0 310 for plane 0 characters, and
michael@0 311 .BR \eu\fIhhhh\fP\eu\fIhhhh\fP
michael@0 312 for planes 1 and above characters,
michael@0 313 where
michael@0 314 .I hhhh
michael@0 315 is the hexadecimal value of one of the UTF-16 code units representing the
michael@0 316 character. Characters from planes 1 and above are written as a pair of
michael@0 317 UTF-16 surrogate code units.
michael@0 318 .TP
michael@0 319 .B escape-c
michael@0 320 Replace the missing characters with a string of the format
michael@0 321 .BR \eu\fIhhhh\fP
michael@0 322 for plane 0 characters, and
michael@0 323 .BR \eU\fIhhhhhhhh\fP
michael@0 324 for planes 1 and above characters,
michael@0 325 where
michael@0 326 .I hhhh
michael@0 327 and
michael@0 328 .I hhhhhhhh
michael@0 329 are the hexadecimal values of the Unicode codepoint.
michael@0 330 .TP
michael@0 331 .B escape-xml
michael@0 332 Same as
michael@0 333 .BR escape-xml-hex .
michael@0 334 .TP
michael@0 335 .B escape-xml-hex
michael@0 336 Replace the missing characters with a string of the format
michael@0 337 .BR &#x\fIhhhh\fP; ,
michael@0 338 where
michael@0 339 .I hhhh
michael@0 340 is the hexadecimal value of the Unicode codepoint.
michael@0 341 .TP
michael@0 342 .B escape-xml-dec
michael@0 343 Replace the missing characters with a string of the format
michael@0 344 .BR &#\fInnnn\fP; ,
michael@0 345 where
michael@0 346 .I nnnn
michael@0 347 is the decimal value of the Unicode codepoint.
michael@0 348 .TP
michael@0 349 .B escape-unicode
michael@0 350 Replace the missing characters with a string of the format
michael@0 351 .BR {U+\fIhhhh\fP} ,
michael@0 352 where
michael@0 353 .I hhhh
michael@0 354 is the hexadecimal value of the Unicode codepoint.
michael@0 355 That hexadecimal string is of variable length and can use from 4 to
michael@0 356 6 digits.
michael@0 357 This is the format universally used to denote a Unicode codepoint in
michael@0 358 the literature, delimited by curly braces for easy recognition of those
michael@0 359 substitutions in the output.
michael@0 360 .SH EXAMPLES
michael@0 361 Convert data from a given
michael@0 362 .I encoding
michael@0 363 to the platform encoding:
michael@0 364
michael@0 365 .RS 4
michael@0 366 .B \fR$ \fPuconv \-f \fIencoding\fP
michael@0 367 .RE
michael@0 368 .PP
michael@0 369 Check if a
michael@0 370 .I file
michael@0 371 contains valid data for a given
michael@0 372 .IR encoding :
michael@0 373
michael@0 374 .RS 4
michael@0 375 .B \fR$ \fPuconv \-f \fIencoding\fP \-c \fIfile\fP >/dev/null
michael@0 376 .RE
michael@0 377 .PP
michael@0 378 Convert a UTF-8
michael@0 379 .I file
michael@0 380 to a given
michael@0 381 .I encoding
michael@0 382 and ensure that the resulting text is good for any version of HTML:
michael@0 383
michael@0 384 .RS 4
michael@0 385 .B \fR$ \fPuconv \-f utf-8 \-t \fIencoding\fP \e
michael@0 386 .br
michael@0 387 .B " \-\-callback escape-xml-dec \fIfile\fP"
michael@0 388 .RE
michael@0 389 .PP
michael@0 390 Display the names of the Unicode code points in a UTF-8 file:
michael@0 391
michael@0 392 .RS 4
michael@0 393 .B \fR$ \fPuconv \-f utf-8 \-x any-name \fIfile\fP
michael@0 394 .RE
michael@0 395 .PP
michael@0 396 Print the name of a Unicode code point whose value is known (\fBU+30AB\fP
michael@0 397 in this example):
michael@0 398
michael@0 399 .RS 4
michael@0 400 .B \fR$ \fPecho '\eu30ab' | uconv \-x 'hex-any; any-name'; echo
michael@0 401 .br
michael@0 402 {KATAKANA LETTER KA}{LINE FEED}
michael@0 403 .br
michael@0 404 $
michael@0 405 .RE
michael@0 406
michael@0 407 (The names are delimited by curly braces.
michael@0 408 The name of the line terminator is also displayed.)
michael@0 409 .PP
michael@0 410 Normalize UTF-8 data using Unicode NFKC, remove all control characters,
michael@0 411 and map Katakana to Hiragana:
michael@0 412
michael@0 413 .RS 4
michael@0 414 .B \fR$ \fPuconv \-f utf-8 \-t utf-8 \e
michael@0 415 .br
michael@0 416 .B " \-x '::nfkc; [:Cc:] >; ::katakana-hiragana;'"
michael@0 417 .SH CAVEATS AND BUGS
michael@0 418 .B uconv
michael@0 419 reports errors as occurring at the first invalid byte
michael@0 420 encountered. This may be confusing to users of GNU
michael@0 421 .BR iconv (1),
michael@0 422 which reports errors as occurring at the first byte of an invalid
michael@0 423 sequence. For multi-byte character sets or encodings, this means that
michael@0 424 .BR uconv
michael@0 425 error positions may be at a later offset in the input stream than
michael@0 426 would be the case with GNU
michael@0 427 .BR iconv (1).
michael@0 428 .PP
michael@0 429 The reporting of error positions when a transliterator is used may be
michael@0 430 inaccurate or unavailable, in which case
michael@0 431 .BR uconv
michael@0 432 will report the offset in the output stream at which the error
michael@0 433 occurred.
michael@0 434 .SH AUTHORS
michael@0 435 Jonas Utterstroem
michael@0 436 .br
michael@0 437 Yves Arrouye
michael@0 438 .SH VERSION
michael@0 439 @VERSION@
michael@0 440 .SH COPYRIGHT
michael@0 441 Copyright (C) 2000-2005 IBM, Inc. and others.
michael@0 442 .SH SEE ALSO
michael@0 443 .BR iconv (1)

mercurial