1#!./perl 2# $Id: piconv,v 2.6 2014/03/28 02:37:42 dankogai Exp $ 3# 4BEGIN { pop @INC if $INC[-1] eq '.' } 5use 5.8.0; 6use strict; 7use Encode ; 8use Encode::Alias; 9my %Scheme = map {$_ => 1} qw(from_to decode_encode perlio); 10 11use File::Basename; 12my $name = basename($0); 13 14use Getopt::Long qw(:config no_ignore_case); 15 16my %Opt; 17 18help() 19 unless 20 GetOptions(\%Opt, 21 'from|f=s', 22 'to|t=s', 23 'list|l', 24 'string|s=s', 25 'check|C=i', 26 'c', 27 'perlqq|p', 28 'htmlcref', 29 'xmlcref', 30 'debug|D', 31 'scheme|S=s', 32 'resolve|r=s', 33 'help', 34 ); 35 36$Opt{help} and help(); 37$Opt{list} and list_encodings(); 38my $locale = $ENV{LC_CTYPE} || $ENV{LC_ALL} || $ENV{LANG}; 39defined $Opt{resolve} and resolve_encoding($Opt{resolve}); 40$Opt{from} || $Opt{to} || help(); 41my $from = $Opt{from} || $locale or help("from_encoding unspecified"); 42my $to = $Opt{to} || $locale or help("to_encoding unspecified"); 43$Opt{string} and Encode::from_to($Opt{string}, $from, $to) and print $Opt{string} and exit; 44my $scheme = do { 45 if (defined $Opt{scheme}) { 46 if (!exists $Scheme{$Opt{scheme}}) { 47 warn "Unknown scheme '$Opt{scheme}', fallback to 'from_to'.\n"; 48 'from_to'; 49 } else { 50 $Opt{scheme}; 51 } 52 } else { 53 'from_to'; 54 } 55}; 56 57$Opt{check} ||= $Opt{c}; 58$Opt{perlqq} and $Opt{check} = Encode::PERLQQ; 59$Opt{htmlcref} and $Opt{check} = Encode::HTMLCREF; 60$Opt{xmlcref} and $Opt{check} = Encode::XMLCREF; 61 62if ($Opt{debug}){ 63 my $cfrom = Encode->getEncoding($from)->name; 64 my $cto = Encode->getEncoding($to)->name; 65 print <<"EOT"; 66Scheme: $scheme 67From: $from => $cfrom 68To: $to => $cto 69EOT 70} 71 72my %use_bom = 73 map { $_ => 1 } qw/UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE/; 74 75# we do not use <> (or ARGV) for the sake of binmode() 76@ARGV or push @ARGV, \*STDIN; 77 78unless ( $scheme eq 'perlio' ) { 79 binmode STDOUT; 80 my $need2slurp = $use_bom{ find_encoding($to)->name } 81 || $use_bom{ find_encoding($from)->name }; 82 for my $argv (@ARGV) { 83 my $ifh = ref $argv ? $argv : undef; 84 $ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next; 85 $ifh or open $ifh, "<", $argv or next; 86 binmode $ifh; 87 if ( $scheme eq 'from_to' ) { # default 88 if ($need2slurp){ 89 local $/; 90 $_ = <$ifh>; 91 Encode::from_to( $_, $from, $to, $Opt{check} ); 92 print; 93 }else{ 94 while (<$ifh>) { 95 Encode::from_to( $_, $from, $to, $Opt{check} ); 96 print; 97 } 98 } 99 } 100 elsif ( $scheme eq 'decode_encode' ) { # step-by-step 101 if ($need2slurp){ 102 local $/; 103 $_ = <$ifh>; 104 my $decoded = decode( $from, $_, $Opt{check} ); 105 my $encoded = encode( $to, $decoded ); 106 print $encoded; 107 }else{ 108 while (<$ifh>) { 109 my $decoded = decode( $from, $_, $Opt{check} ); 110 my $encoded = encode( $to, $decoded ); 111 print $encoded; 112 } 113 } 114 } 115 else { # won't reach 116 die "$name: unknown scheme: $scheme"; 117 } 118 } 119} 120else { 121 122 # NI-S favorite 123 binmode STDOUT => "raw:encoding($to)"; 124 for my $argv (@ARGV) { 125 my $ifh = ref $argv ? $argv : undef; 126 $ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next; 127 $ifh or open $ifh, "<", $argv or next; 128 binmode $ifh => "raw:encoding($from)"; 129 print while (<$ifh>); 130 } 131} 132 133sub list_encodings { 134 print join( "\n", Encode->encodings(":all") ), "\n"; 135 exit 0; 136} 137 138sub resolve_encoding { 139 if ( my $alias = Encode::resolve_alias( $_[0] ) ) { 140 print $alias, "\n"; 141 exit 0; 142 } 143 else { 144 warn "$name: $_[0] is not known to Encode\n"; 145 exit 1; 146 } 147} 148 149sub help { 150 my $message = shift; 151 $message and print STDERR "$name error: $message\n"; 152 print STDERR <<"EOT"; 153$name [-f from_encoding] [-t to_encoding] 154 [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme] 155 [-s string|file...] 156$name -l 157$name -r encoding_alias 158$name -h 159Common options: 160 -l,--list 161 lists all available encodings 162 -r,--resolve encoding_alias 163 resolve encoding to its (Encode) canonical name 164 -f,--from from_encoding 165 when omitted, the current locale will be used 166 -t,--to to_encoding 167 when omitted, the current locale will be used 168 -s,--string string 169 "string" will be the input instead of STDIN or files 170The following are mainly of interest to Encode hackers: 171 -C N | -c check the validity of the input 172 -D,--debug show debug information 173 -S,--scheme scheme use the scheme for conversion 174Those are handy when you can only see ASCII characters: 175 -p,--perlqq transliterate characters missing in encoding to \\x{HHHH} 176 where HHHH is the hexadecimal Unicode code point 177 --htmlcref transliterate characters missing in encoding to &#NNN; 178 where NNN is the decimal Unicode code point 179 --xmlcref transliterate characters missing in encoding to &#xHHHH; 180 where HHHH is the hexadecimal Unicode code point 181 182EOT 183 exit; 184} 185 186__END__ 187 188=head1 NAME 189 190piconv -- iconv(1), reinvented in perl 191 192=head1 SYNOPSIS 193 194 piconv [-f from_encoding] [-t to_encoding] 195 [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme] 196 [-s string|file...] 197 piconv -l 198 piconv -r encoding_alias 199 piconv -h 200 201=head1 DESCRIPTION 202 203B<piconv> is perl version of B<iconv>, a character encoding converter 204widely available for various Unixen today. This script was primarily 205a technology demonstrator for Perl 5.8.0, but you can use piconv in the 206place of iconv for virtually any case. 207 208piconv converts the character encoding of either STDIN or files 209specified in the argument and prints out to STDOUT. 210 211Here is the list of options. Some options can be in short format (-f) 212or long (--from) one. 213 214=over 4 215 216=item -f,--from I<from_encoding> 217 218Specifies the encoding you are converting from. Unlike B<iconv>, 219this option can be omitted. In such cases, the current locale is used. 220 221=item -t,--to I<to_encoding> 222 223Specifies the encoding you are converting to. Unlike B<iconv>, 224this option can be omitted. In such cases, the current locale is used. 225 226Therefore, when both -f and -t are omitted, B<piconv> just acts 227like B<cat>. 228 229=item -s,--string I<string> 230 231uses I<string> instead of file for the source of text. 232 233=item -l,--list 234 235Lists all available encodings, one per line, in case-insensitive 236order. Note that only the canonical names are listed; many aliases 237exist. For example, the names are case-insensitive, and many standard 238and common aliases work, such as "latin1" for "ISO-8859-1", or "ibm850" 239instead of "cp850", or "winlatin1" for "cp1252". See L<Encode::Supported> 240for a full discussion. 241 242=item -r,--resolve I<encoding_alias> 243 244Resolve I<encoding_alias> to Encode canonical encoding name. 245 246=item -C,--check I<N> 247 248Check the validity of the stream if I<N> = 1. When I<N> = -1, something 249interesting happens when it encounters an invalid character. 250 251=item -c 252 253Same as C<-C 1>. 254 255=item -p,--perlqq 256 257Transliterate characters missing in encoding to \x{HHHH} where HHHH is the 258hexadecimal Unicode code point. 259 260=item --htmlcref 261 262Transliterate characters missing in encoding to &#NNN; where NNN is the 263decimal Unicode code point. 264 265=item --xmlcref 266 267Transliterate characters missing in encoding to &#xHHHH; where HHHH is the 268hexadecimal Unicode code point. 269 270=item -h,--help 271 272Show usage. 273 274=item -D,--debug 275 276Invokes debugging mode. Primarily for Encode hackers. 277 278=item -S,--scheme I<scheme> 279 280Selects which scheme is to be used for conversion. Available schemes 281are as follows: 282 283=over 4 284 285=item from_to 286 287Uses Encode::from_to for conversion. This is the default. 288 289=item decode_encode 290 291Input strings are decode()d then encode()d. A straight two-step 292implementation. 293 294=item perlio 295 296The new perlIO layer is used. NI-S' favorite. 297 298You should use this option if you are using UTF-16 and others which 299linefeed is not $/. 300 301=back 302 303Like the I<-D> option, this is also for Encode hackers. 304 305=back 306 307=head1 SEE ALSO 308 309L<iconv(1)> 310L<locale(3)> 311L<Encode> 312L<Encode::Supported> 313L<Encode::Alias> 314L<PerlIO> 315 316=cut 317