intl/icu/source/extra/uconv/uconv.1.in

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/extra/uconv/uconv.1.in	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,443 @@
     1.4 +.\" Hey, Emacs! This is -*-nroff-*- you know...
     1.5 +.\"
     1.6 +.\" uconv.1: manual page for the uconv utility.
     1.7 +.\"
     1.8 +.\" Copyright (C) 2000-2013 IBM, Inc. and others.
     1.9 +.\"
    1.10 +.\" Manual page by Yves Arrouye <yves@realnames.com>.
    1.11 +.\"
    1.12 +.TH UCONV 1 "2005-jul-1" "ICU MANPAGE" "ICU @VERSION@ Manual"
    1.13 +.SH NAME
    1.14 +.B uconv
    1.15 +\- convert data from one encoding to another
    1.16 +.SH SYNOPSIS
    1.17 +.B uconv
    1.18 +[
    1.19 +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
    1.20 +]
    1.21 +[
    1.22 +.BI "\-V\fP, \fB\-\-version"
    1.23 +]
    1.24 +[
    1.25 +.BI "\-s\fP, \fB\-\-silent"
    1.26 +]
    1.27 +[
    1.28 +.BI "\-v\fP, \fB\-\-verbose"
    1.29 +]
    1.30 +[
    1.31 +.BI "\-l\fP, \fB\-\-list"
    1.32 +|
    1.33 +.BI "\-l\fP, \fB\-\-list\-code" " code"
    1.34 +|
    1.35 +.BI "\-\-default-code"
    1.36 +|
    1.37 +.BI "\-L\fP, \fB\-\-list\-transliterators"
    1.38 +]
    1.39 +[
    1.40 +.BI "\-\-canon"
    1.41 +]
    1.42 +[
    1.43 +.BI "\-x" " transliteration"
    1.44 +]
    1.45 +[
    1.46 +.BI "\-\-to\-callback" " callback"
    1.47 +|
    1.48 +.B "\-c"
    1.49 +]
    1.50 +[
    1.51 +.BI "\-\-from\-callback" " callback"
    1.52 +|
    1.53 +.B "\-i"
    1.54 +]
    1.55 +[
    1.56 +.BI "\-\-callback" " callback"
    1.57 +]
    1.58 +[
    1.59 +.BI "\-\-fallback"
    1.60 +|
    1.61 +.BI "\-\-no\-fallback"
    1.62 +]
    1.63 +[
    1.64 +.BI "\-b\fP, \fB\-\-block\-size" " size"
    1.65 +]
    1.66 +[
    1.67 +.BI "\-f\fP, \fB\-\-from\-code" " encoding"
    1.68 +]
    1.69 +[
    1.70 +.BI "\-t\fP, \fB\-\-to\-code" " encoding"
    1.71 +]
    1.72 +[
    1.73 +.BI "\-\-add\-signature"
    1.74 +]
    1.75 +[
    1.76 +.BI "\-\-remove\-signature"
    1.77 +]
    1.78 +[
    1.79 +.BI "\-o\fP, \fB\-\-output" " file"
    1.80 +]
    1.81 +[
    1.82 +.IR file .\|.\|.
    1.83 +]
    1.84 +.SH DESCRIPTION
    1.85 +.B uconv
    1.86 +converts, or transcodes, each given
    1.87 +.I file
    1.88 +(or its standard input if no
    1.89 +.I file
    1.90 +is specified) from one
    1.91 +.I encoding
    1.92 +to another. 
    1.93 +The transcoding is done using Unicode as a pivot encoding
    1.94 +(i.e. the data are first transcoded from their original encoding to
    1.95 +Unicode, and then from Unicode to the destination encoding).
    1.96 +.PP
    1.97 +If an
    1.98 +.I encoding
    1.99 +is not specified or is
   1.100 +.BR - ,
   1.101 +the default encoding is used. Thus, calling
   1.102 +.B uconv
   1.103 +with no
   1.104 +.I encoding
   1.105 +provides an easy way to validate and sanitize data files for
   1.106 +further consumption by tools requiring data in the default encoding.
   1.107 +.PP
   1.108 +When calling
   1.109 +.BR uconv ,
   1.110 +it is possible to specify callbacks that are used to handle invalid
   1.111 +characters in the input, or characters that cannot be transcoded to
   1.112 +the destination encoding. Some encodings, for example, offer a default
   1.113 +substitution character that can be used to represent the occurrence of
   1.114 +such characters in the input. Other callbacks offer a useful visual
   1.115 +representation of the invalid data.
   1.116 +.PP
   1.117 +.B uconv
   1.118 +can also run the specified
   1.119 +.IR transliteration
   1.120 +on the transcoded data,
   1.121 +in which case transliteration will happen as an intermediate step,
   1.122 +after the data have been transcoded to Unicode.
   1.123 +The
   1.124 +.I transliteration
   1.125 +can be either a list of semicolon-separated transliterator names,
   1.126 +or an arbitrarily complex set of rules in the ICU transliteration
   1.127 +rules format.
   1.128 +.PP
   1.129 +For transcoding purposes,
   1.130 +.B uconv
   1.131 +options are compatible with those of
   1.132 +.BR iconv (1),
   1.133 +making it easy to replace it in scripts. It is not necessarily the case,
   1.134 +however, that the encoding names used by
   1.135 +.B uconv
   1.136 +and ICU are the same as the ones used by
   1.137 +.BR iconv (1).
   1.138 +Also, options that provide informational data, such as the
   1.139 +.B \-l\fP, \fB\-\-list
   1.140 +one offered by some 
   1.141 +.BR iconv (1)
   1.142 +variants such as GNU's, produce data in a slightly different and
   1.143 +easier to parse format.
   1.144 +.SH OPTIONS
   1.145 +.TP
   1.146 +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
   1.147 +Print help about usage and exit.
   1.148 +.TP
   1.149 +.BR "\-V\fP, \fB\-\-version"
   1.150 +Print the version of
   1.151 +.B uconv
   1.152 +and exit.
   1.153 +.TP
   1.154 +.BI "\-s\fP, \fB\-\-silent"
   1.155 +Suppress messages during execution.
   1.156 +.TP
   1.157 +.BI "\-v\fP, \fB\-\-verbose"
   1.158 +Display extra informative messages during execution.
   1.159 +.TP
   1.160 +.BI "\-l\fP, \fB\-\-list"
   1.161 +List all the available encodings and exit.
   1.162 +.TP
   1.163 +.BI "\-l\fP, \fB\-\-list\-code" " code"
   1.164 +List only the
   1.165 +.I code
   1.166 +encoding and exit. If
   1.167 +.I code
   1.168 +is not a proper encoding, exit with an error.
   1.169 +.TP
   1.170 +.BI "\-\-default-code"
   1.171 +List only the name of the default encoding and exit.
   1.172 +.TP
   1.173 +.BI "\-L\fP, \fB\-\-list\-transliterators"
   1.174 +List all the available transliterators and exit.
   1.175 +.TP
   1.176 +.BI "\-\-canon"
   1.177 +If used with
   1.178 +.BI "\-l\fP, \fB\-\-list"
   1.179 +or
   1.180 +.BR "\-\-default-code" ,
   1.181 +the list of encodings is produced in a format compatible with
   1.182 +.BR convrtrs.txt (5).
   1.183 +If used with
   1.184 +.BR "\-L\fP, \fB\-\-list\-transliterators" ,
   1.185 +print only one transliterator name per line.
   1.186 +.TP
   1.187 +.BI "\-x" " transliteration"
   1.188 +Run the given
   1.189 +.IR transliteration
   1.190 +on the transcoded Unicode data,
   1.191 +and use the transliterated data as input for the transcoding to
   1.192 +the destination encoding.
   1.193 +.TP
   1.194 +.BI "\-\-to\-callback" " callback"
   1.195 +Use
   1.196 +.I callback
   1.197 +to handle characters that cannot be transcoded to the destination
   1.198 +encoding. See section
   1.199 +.B CALLBACKS
   1.200 +for details on valid callbacks.
   1.201 +.TP
   1.202 +.B "\-c"
   1.203 +Omit invalid characters from the output.
   1.204 +Same as
   1.205 +.BR "\-\-to\-callback skip" .
   1.206 +.TP
   1.207 +.BI "\-\-from\-callback" " callback"
   1.208 +Use
   1.209 +.I callback
   1.210 +to handle characters that cannot be transcoded from the original
   1.211 +encoding. See section
   1.212 +.B CALLBACKS
   1.213 +for details on valid callbacks.
   1.214 +.TP
   1.215 +.B "\-i"
   1.216 +Ignore invalid sequences in the input.
   1.217 +Same as
   1.218 +.BR "\-\-from\-callback skip" .
   1.219 +.TP
   1.220 +.BI "\-\-callback" " callback"
   1.221 +Use
   1.222 +.I callback
   1.223 +to handle both characters that cannot be transcoded from the original
   1.224 +encoding and characters that cannot be transcoded to the destination
   1.225 +encoding. See section
   1.226 +.B CALLBACKS
   1.227 +for details on valid callbacks.
   1.228 +.TP
   1.229 +.BI "\-\-fallback"
   1.230 +Use the fallback mapping when transcoding from
   1.231 +Unicode to the destination encoding.
   1.232 +.TP
   1.233 +.BI "\-\-no\-fallback"
   1.234 +Do not use the fallback mapping when transcoding from Unicode to the
   1.235 +destination encoding.
   1.236 +This is the default.
   1.237 +.TP
   1.238 +.BI "\-b\fP, \fB\-\-block\-size" " size"
   1.239 +Read input in blocks of
   1.240 +.I size
   1.241 +bytes at a time. The default block size is
   1.242 +4096.
   1.243 +.TP
   1.244 +.BI "\-f\fP, \fB\-\-from\-code" " encoding"
   1.245 +Set the original encoding of the data to 
   1.246 +.IR encoding .
   1.247 +.TP
   1.248 +.BI "\-t\fP, \fB\-\-to\-code" " encoding"
   1.249 +Transcode the data to
   1.250 +.IR encoding .
   1.251 +.TP
   1.252 +.BI "\-\-add\-signature"
   1.253 +Add a U+FEFF Unicode signature character (BOM) if the output charset
   1.254 +supports it and does not add one anyway.
   1.255 +.TP
   1.256 +.BI "\-\-remove\-signature"
   1.257 +Remove a U+FEFF Unicode signature character (BOM).
   1.258 +.TP
   1.259 +.BI "\-o\fP, \fB\-\-output" " file"
   1.260 +Write the transcoded data to
   1.261 +.IR file .
   1.262 +.SH CALLBACKS
   1.263 +.B uconv
   1.264 +supports specifying callbacks to handle invalid data. Callbacks can be
   1.265 +set for both directions of transcoding: from the original encoding to
   1.266 +Unicode, with the
   1.267 +.BR "\-\-from\-callback"
   1.268 +option, and from Unicode to the destination encoding, with the
   1.269 +.BR "\-\-to\-callback"
   1.270 +option.
   1.271 +.PP
   1.272 +The following is a list of valid
   1.273 +.I callback
   1.274 +names, along with a description of their behavior. The list of
   1.275 +callbacks actually supported by
   1.276 +.B uconv
   1.277 +is displayed when it is called with
   1.278 +.BR "\-h\fP, \fB\-\-help" .
   1.279 +.PP
   1.280 +.TP \w'\fBescape-unicode'u+3n
   1.281 +.B substitute
   1.282 +Write the encoding's substitute sequence, or the Unicode
   1.283 +replacement character
   1.284 +.B U+FFFD
   1.285 +when transcoding to Unicode.
   1.286 +.TP
   1.287 +.B skip
   1.288 +Ignore the invalid data.
   1.289 +.TP
   1.290 +.B stop
   1.291 +Stop with an error when encountering invalid data.
   1.292 +This is the default callback.
   1.293 +.TP
   1.294 +.B escape
   1.295 +Same as
   1.296 +.BR escape-icu .
   1.297 +.TP
   1.298 +.B escape-icu
   1.299 +Replace the missing characters with a string of the format
   1.300 +.BR %U\fIhhhh\fP
   1.301 +for plane 0 characters, and
   1.302 +.BR %U\fIhhhh\fP%U\fIhhhh\fP
   1.303 +for planes 1 and above characters,
   1.304 +where
   1.305 +.I hhhh
   1.306 +is the hexadecimal value of one of the UTF-16 code units representing the
   1.307 +character. Characters from planes 1 and above are written as a pair of
   1.308 +UTF-16 surrogate code units.
   1.309 +.TP
   1.310 +.B escape-java
   1.311 +Replace the missing characters with a string of the format
   1.312 +.BR \eu\fIhhhh\fP
   1.313 +for plane 0 characters, and
   1.314 +.BR \eu\fIhhhh\fP\eu\fIhhhh\fP
   1.315 +for planes 1 and above characters,
   1.316 +where
   1.317 +.I hhhh
   1.318 +is the hexadecimal value of one of the UTF-16 code units representing the
   1.319 +character. Characters from planes 1 and above are written as a pair of
   1.320 +UTF-16 surrogate code units.
   1.321 +.TP
   1.322 +.B escape-c
   1.323 +Replace the missing characters with a string of the format
   1.324 +.BR \eu\fIhhhh\fP
   1.325 +for plane 0 characters, and
   1.326 +.BR \eU\fIhhhhhhhh\fP
   1.327 +for planes 1 and above characters,
   1.328 +where
   1.329 +.I hhhh
   1.330 +and
   1.331 +.I hhhhhhhh
   1.332 +are the hexadecimal values of the Unicode codepoint.
   1.333 +.TP
   1.334 +.B escape-xml
   1.335 +Same as
   1.336 +.BR escape-xml-hex .
   1.337 +.TP
   1.338 +.B escape-xml-hex
   1.339 +Replace the missing characters with a string of the format
   1.340 +.BR &#x\fIhhhh\fP; ,
   1.341 +where
   1.342 +.I hhhh
   1.343 +is the hexadecimal value of the Unicode codepoint.
   1.344 +.TP
   1.345 +.B escape-xml-dec
   1.346 +Replace the missing characters with a string of the format
   1.347 +.BR &#\fInnnn\fP; ,
   1.348 +where
   1.349 +.I nnnn
   1.350 +is the decimal value of the Unicode codepoint.
   1.351 +.TP
   1.352 +.B escape-unicode
   1.353 +Replace the missing characters with a string of the format
   1.354 +.BR {U+\fIhhhh\fP} ,
   1.355 +where
   1.356 +.I hhhh
   1.357 +is the hexadecimal value of the Unicode codepoint.
   1.358 +That hexadecimal string is of variable length and can use from 4 to
   1.359 +6 digits.
   1.360 +This is the format universally used to denote a Unicode codepoint in
   1.361 +the literature, delimited by curly braces for easy recognition of those
   1.362 +substitutions in the output.
   1.363 +.SH EXAMPLES
   1.364 +Convert data from a given
   1.365 +.I encoding
   1.366 +to the platform encoding:
   1.367 +
   1.368 +.RS 4
   1.369 +.B \fR$ \fPuconv \-f \fIencoding\fP
   1.370 +.RE
   1.371 +.PP
   1.372 +Check if a
   1.373 +.I file
   1.374 +contains valid data for a given
   1.375 +.IR encoding :
   1.376 +
   1.377 +.RS 4
   1.378 +.B \fR$ \fPuconv \-f \fIencoding\fP \-c \fIfile\fP >/dev/null
   1.379 +.RE
   1.380 +.PP
   1.381 +Convert a UTF-8
   1.382 +.I file
   1.383 +to a given
   1.384 +.I encoding
   1.385 +and ensure that the resulting text is good for any version of HTML:
   1.386 +
   1.387 +.RS 4
   1.388 +.B \fR$ \fPuconv \-f utf-8 \-t \fIencoding\fP \e
   1.389 +.br
   1.390 +.B "    \-\-callback escape-xml-dec \fIfile\fP"
   1.391 +.RE
   1.392 +.PP
   1.393 +Display the names of the Unicode code points in a UTF-8 file:
   1.394 +
   1.395 +.RS 4
   1.396 +.B \fR$ \fPuconv \-f utf-8 \-x any-name \fIfile\fP
   1.397 +.RE
   1.398 +.PP
   1.399 +Print the name of a Unicode code point whose value is known (\fBU+30AB\fP
   1.400 +in this example):
   1.401 +
   1.402 +.RS 4
   1.403 +.B \fR$ \fPecho '\eu30ab' | uconv \-x 'hex-any; any-name'; echo
   1.404 +.br
   1.405 +{KATAKANA LETTER KA}{LINE FEED}
   1.406 +.br
   1.407 +$ 
   1.408 +.RE
   1.409 +
   1.410 +(The names are delimited by curly braces.
   1.411 +The name of the line terminator is also displayed.)
   1.412 +.PP
   1.413 +Normalize UTF-8 data using Unicode NFKC, remove all control characters,
   1.414 +and map Katakana to Hiragana:
   1.415 +
   1.416 +.RS 4
   1.417 +.B \fR$ \fPuconv \-f utf-8 \-t utf-8 \e
   1.418 +.br
   1.419 +.B "      \-x '::nfkc; [:Cc:] >; ::katakana-hiragana;'"
   1.420 +.SH CAVEATS AND BUGS
   1.421 +.B uconv
   1.422 +reports errors as occurring at the first invalid byte
   1.423 +encountered. This may be confusing to users of GNU
   1.424 +.BR iconv (1),
   1.425 +which reports errors as occurring at the first byte of an invalid
   1.426 +sequence. For multi-byte character sets or encodings, this means that
   1.427 +.BR uconv
   1.428 +error positions may be at a later offset in the input stream than
   1.429 +would be the case with GNU
   1.430 +.BR iconv (1).
   1.431 +.PP
   1.432 +The reporting of error positions when a transliterator is used may be
   1.433 +inaccurate or unavailable, in which case
   1.434 +.BR uconv
   1.435 +will report the offset in the output stream at which the error
   1.436 +occurred.
   1.437 +.SH AUTHORS
   1.438 +Jonas Utterstroem
   1.439 +.br
   1.440 +Yves Arrouye
   1.441 +.SH VERSION
   1.442 +@VERSION@
   1.443 +.SH COPYRIGHT
   1.444 +Copyright (C) 2000-2005 IBM, Inc. and others.
   1.445 +.SH SEE ALSO
   1.446 +.BR iconv (1)

mercurial