xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/bin/piconv (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1#!./perl
2# $Id: piconv,v 2.6 2014/03/28 02:37:42 dankogai Exp $
3#
4BEGIN { pop @INC if $INC[-1] eq '.' }
5use 5.8.0;
6use strict;
7use Encode ;
8use Encode::Alias;
9my %Scheme =  map {$_ => 1} qw(from_to decode_encode perlio);
10
11use File::Basename;
12my $name = basename($0);
13
14use Getopt::Long qw(:config no_ignore_case);
15
16my %Opt;
17
18help()
19    unless
20      GetOptions(\%Opt,
21         'from|f=s',
22         'to|t=s',
23         'list|l',
24         'string|s=s',
25         'check|C=i',
26         'c',
27         'perlqq|p',
28         'htmlcref',
29         'xmlcref',
30         'debug|D',
31         'scheme|S=s',
32         'resolve|r=s',
33         'help',
34         );
35
36$Opt{help} and help();
37$Opt{list} and list_encodings();
38my $locale = $ENV{LC_CTYPE} || $ENV{LC_ALL} || $ENV{LANG};
39defined $Opt{resolve} and resolve_encoding($Opt{resolve});
40$Opt{from} || $Opt{to} || help();
41my $from = $Opt{from} || $locale or help("from_encoding unspecified");
42my $to   = $Opt{to}   || $locale or help("to_encoding unspecified");
43$Opt{string} and Encode::from_to($Opt{string}, $from, $to) and print $Opt{string} and exit;
44my $scheme = do {
45    if (defined $Opt{scheme}) {
46	if (!exists $Scheme{$Opt{scheme}}) {
47	    warn "Unknown scheme '$Opt{scheme}', fallback to 'from_to'.\n";
48	    'from_to';
49	} else {
50	    $Opt{scheme};
51	}
52    } else {
53	'from_to';
54    }
55};
56
57$Opt{check} ||= $Opt{c};
58$Opt{perlqq}   and $Opt{check} = Encode::PERLQQ;
59$Opt{htmlcref} and $Opt{check} = Encode::HTMLCREF;
60$Opt{xmlcref}  and $Opt{check} = Encode::XMLCREF;
61
62if ($Opt{debug}){
63    my $cfrom = Encode->getEncoding($from)->name;
64    my $cto   = Encode->getEncoding($to)->name;
65    print <<"EOT";
66Scheme: $scheme
67From:   $from => $cfrom
68To:     $to => $cto
69EOT
70}
71
72my %use_bom =
73  map { $_ => 1 } qw/UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE/;
74
75# we do not use <> (or ARGV) for the sake of binmode()
76@ARGV or push @ARGV, \*STDIN;
77
78unless ( $scheme eq 'perlio' ) {
79    binmode STDOUT;
80    my $need2slurp = $use_bom{ find_encoding($to)->name }
81      || $use_bom{ find_encoding($from)->name };
82    for my $argv (@ARGV) {
83        my $ifh = ref $argv ? $argv : undef;
84	$ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next;
85        $ifh or open $ifh, "<", $argv or next;
86        binmode $ifh;
87        if ( $scheme eq 'from_to' ) {    # default
88	    if ($need2slurp){
89		local $/;
90		$_ = <$ifh>;
91		Encode::from_to( $_, $from, $to, $Opt{check} );
92		print;
93	    }else{
94		while (<$ifh>) {
95		    Encode::from_to( $_, $from, $to, $Opt{check} );
96		    print;
97		}
98	    }
99        }
100        elsif ( $scheme eq 'decode_encode' ) {    # step-by-step
101	    if ($need2slurp){
102		local $/;
103		$_ = <$ifh>;
104                my $decoded = decode( $from, $_, $Opt{check} );
105                my $encoded = encode( $to, $decoded );
106                print $encoded;
107	    }else{
108		while (<$ifh>) {
109		    my $decoded = decode( $from, $_, $Opt{check} );
110		    my $encoded = encode( $to, $decoded );
111		    print $encoded;
112		}
113	    }
114	}
115	else {                                    # won't reach
116            die "$name: unknown scheme: $scheme";
117        }
118    }
119}
120else {
121
122    # NI-S favorite
123    binmode STDOUT => "raw:encoding($to)";
124    for my $argv (@ARGV) {
125        my $ifh = ref $argv ? $argv : undef;
126	$ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next;
127        $ifh or open $ifh, "<", $argv or next;
128        binmode $ifh => "raw:encoding($from)";
129        print while (<$ifh>);
130    }
131}
132
133sub list_encodings {
134    print join( "\n", Encode->encodings(":all") ), "\n";
135    exit 0;
136}
137
138sub resolve_encoding {
139    if ( my $alias = Encode::resolve_alias( $_[0] ) ) {
140        print $alias, "\n";
141        exit 0;
142    }
143    else {
144        warn "$name: $_[0] is not known to Encode\n";
145        exit 1;
146    }
147}
148
149sub help {
150    my $message = shift;
151    $message and print STDERR "$name error: $message\n";
152    print STDERR <<"EOT";
153$name [-f from_encoding] [-t to_encoding]
154      [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme]
155      [-s string|file...]
156$name -l
157$name -r encoding_alias
158$name -h
159Common options:
160  -l,--list
161     lists all available encodings
162  -r,--resolve encoding_alias
163    resolve encoding to its (Encode) canonical name
164  -f,--from from_encoding
165     when omitted, the current locale will be used
166  -t,--to to_encoding
167     when omitted, the current locale will be used
168  -s,--string string
169     "string" will be the input instead of STDIN or files
170The following are mainly of interest to Encode hackers:
171  -C N | -c           check the validity of the input
172  -D,--debug          show debug information
173  -S,--scheme scheme  use the scheme for conversion
174Those are handy when you can only see ASCII characters:
175  -p,--perlqq         transliterate characters missing in encoding to \\x{HHHH}
176                      where HHHH is the hexadecimal Unicode code point
177  --htmlcref          transliterate characters missing in encoding to &#NNN;
178                      where NNN is the decimal Unicode code point
179  --xmlcref           transliterate characters missing in encoding to &#xHHHH;
180                      where HHHH is the hexadecimal Unicode code point
181
182EOT
183    exit;
184}
185
186__END__
187
188=head1 NAME
189
190piconv -- iconv(1), reinvented in perl
191
192=head1 SYNOPSIS
193
194  piconv [-f from_encoding] [-t to_encoding]
195         [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme]
196         [-s string|file...]
197  piconv -l
198  piconv -r encoding_alias
199  piconv -h
200
201=head1 DESCRIPTION
202
203B<piconv> is perl version of B<iconv>, a character encoding converter
204widely available for various Unixen today.  This script was primarily
205a technology demonstrator for Perl 5.8.0, but you can use piconv in the
206place of iconv for virtually any case.
207
208piconv converts the character encoding of either STDIN or files
209specified in the argument and prints out to STDOUT.
210
211Here is the list of options.  Some options can be in short format (-f)
212or long (--from) one.
213
214=over 4
215
216=item -f,--from I<from_encoding>
217
218Specifies the encoding you are converting from.  Unlike B<iconv>,
219this option can be omitted.  In such cases, the current locale is used.
220
221=item -t,--to I<to_encoding>
222
223Specifies the encoding you are converting to.  Unlike B<iconv>,
224this option can be omitted.  In such cases, the current locale is used.
225
226Therefore, when both -f and -t are omitted, B<piconv> just acts
227like B<cat>.
228
229=item -s,--string I<string>
230
231uses I<string> instead of file for the source of text.
232
233=item -l,--list
234
235Lists all available encodings, one per line, in case-insensitive
236order.  Note that only the canonical names are listed; many aliases
237exist.  For example, the names are case-insensitive, and many standard
238and common aliases work, such as "latin1" for "ISO-8859-1", or "ibm850"
239instead of "cp850", or "winlatin1" for "cp1252".  See L<Encode::Supported>
240for a full discussion.
241
242=item -r,--resolve I<encoding_alias>
243
244Resolve I<encoding_alias> to Encode canonical encoding name.
245
246=item -C,--check I<N>
247
248Check the validity of the stream if I<N> = 1.  When I<N> = -1, something
249interesting happens when it encounters an invalid character.
250
251=item -c
252
253Same as C<-C 1>.
254
255=item -p,--perlqq
256
257Transliterate characters missing in encoding to \x{HHHH} where HHHH is the
258hexadecimal Unicode code point.
259
260=item --htmlcref
261
262Transliterate characters missing in encoding to &#NNN; where NNN is the
263decimal Unicode code point.
264
265=item --xmlcref
266
267Transliterate characters missing in encoding to &#xHHHH; where HHHH is the
268hexadecimal Unicode code point.
269
270=item -h,--help
271
272Show usage.
273
274=item -D,--debug
275
276Invokes debugging mode.  Primarily for Encode hackers.
277
278=item -S,--scheme I<scheme>
279
280Selects which scheme is to be used for conversion.  Available schemes
281are as follows:
282
283=over 4
284
285=item from_to
286
287Uses Encode::from_to for conversion.  This is the default.
288
289=item decode_encode
290
291Input strings are decode()d then encode()d.  A straight two-step
292implementation.
293
294=item perlio
295
296The new perlIO layer is used.  NI-S' favorite.
297
298You should use this option if you are using UTF-16 and others which
299linefeed is not $/.
300
301=back
302
303Like the I<-D> option, this is also for Encode hackers.
304
305=back
306
307=head1 SEE ALSO
308
309L<iconv(1)>
310L<locale(3)>
311L<Encode>
312L<Encode::Supported>
313L<Encode::Alias>
314L<PerlIO>
315
316=cut
317