1package ExtUtils::MakeMaker::Locale; 2 3use strict; 4our $VERSION = "7.10"; 5 6use base 'Exporter'; 7our @EXPORT_OK = qw( 8 decode_argv env 9 $ENCODING_LOCALE $ENCODING_LOCALE_FS 10 $ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT 11); 12 13use Encode (); 14use Encode::Alias (); 15 16our $ENCODING_LOCALE; 17our $ENCODING_LOCALE_FS; 18our $ENCODING_CONSOLE_IN; 19our $ENCODING_CONSOLE_OUT; 20 21sub DEBUG () { 0 } 22 23sub _init { 24 if ($^O eq "MSWin32") { 25 unless ($ENCODING_LOCALE) { 26 # Try to obtain what the Windows ANSI code page is 27 eval { 28 unless (defined &GetConsoleCP) { 29 require Win32; 30 # no point falling back to Win32::GetConsoleCP from this 31 # as added same time, 0.45 32 eval { Win32::GetConsoleCP() }; 33 # manually "import" it since Win32->import refuses 34 *GetConsoleCP = sub { &Win32::GetConsoleCP } unless $@; 35 } 36 unless (defined &GetConsoleCP) { 37 require Win32::API; 38 Win32::API->Import('kernel32', 'int GetConsoleCP()'); 39 } 40 if (defined &GetConsoleCP) { 41 my $cp = GetConsoleCP(); 42 $ENCODING_LOCALE = "cp$cp" if $cp; 43 } 44 }; 45 } 46 47 unless ($ENCODING_CONSOLE_IN) { 48 # only test one since set together 49 unless (defined &GetInputCP) { 50 eval { 51 require Win32; 52 eval { Win32::GetConsoleCP() }; 53 # manually "import" it since Win32->import refuses 54 *GetInputCP = sub { &Win32::GetConsoleCP } unless $@; 55 *GetOutputCP = sub { &Win32::GetConsoleOutputCP } unless $@; 56 }; 57 unless (defined &GetInputCP) { 58 eval { 59 # try Win32::Console module for codepage to use 60 require Win32::Console; 61 eval { Win32::Console::InputCP() }; 62 *GetInputCP = sub { &Win32::Console::InputCP } 63 unless $@; 64 *GetOutputCP = sub { &Win32::Console::OutputCP } 65 unless $@; 66 }; 67 } 68 unless (defined &GetInputCP) { 69 # final fallback 70 *GetInputCP = *GetOutputCP = sub { 71 # another fallback that could work is: 72 # reg query HKLM\System\CurrentControlSet\Control\Nls\CodePage /v ACP 73 ((qx(chcp) || '') =~ /^Active code page: (\d+)/) 74 ? $1 : (); 75 }; 76 } 77 } 78 my $cp = GetInputCP(); 79 $ENCODING_CONSOLE_IN = "cp$cp" if $cp; 80 $cp = GetOutputCP(); 81 $ENCODING_CONSOLE_OUT = "cp$cp" if $cp; 82 } 83 } 84 85 unless ($ENCODING_LOCALE) { 86 eval { 87 require I18N::Langinfo; 88 $ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET()); 89 90 # Workaround of Encode < v2.25. The "646" encoding alias was 91 # introduced in Encode-2.25, but we don't want to require that version 92 # quite yet. Should avoid the CPAN testers failure reported from 93 # openbsd-4.7/perl-5.10.0 combo. 94 $ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646"; 95 96 # https://rt.cpan.org/Ticket/Display.html?id=66373 97 $ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8"; 98 }; 99 $ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN; 100 } 101 102 if ($^O eq "darwin") { 103 $ENCODING_LOCALE_FS ||= "UTF-8"; 104 } 105 106 # final fallback 107 $ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8"; 108 $ENCODING_LOCALE_FS ||= $ENCODING_LOCALE; 109 $ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE; 110 $ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN; 111 112 unless (Encode::find_encoding($ENCODING_LOCALE)) { 113 my $foundit; 114 if (lc($ENCODING_LOCALE) eq "gb18030") { 115 eval { 116 require Encode::HanExtra; 117 }; 118 if ($@) { 119 die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped"; 120 } 121 $foundit++ if Encode::find_encoding($ENCODING_LOCALE); 122 } 123 die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped" 124 unless $foundit; 125 126 } 127 128 # use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT; 129} 130 131_init(); 132Encode::Alias::define_alias(sub { 133 no strict 'refs'; 134 no warnings 'once'; 135 return ${"ENCODING_" . uc(shift)}; 136}, "locale"); 137 138sub _flush_aliases { 139 no strict 'refs'; 140 for my $a (keys %Encode::Alias::Alias) { 141 if (defined ${"ENCODING_" . uc($a)}) { 142 delete $Encode::Alias::Alias{$a}; 143 warn "Flushed alias cache for $a" if DEBUG; 144 } 145 } 146} 147 148sub reinit { 149 $ENCODING_LOCALE = shift; 150 $ENCODING_LOCALE_FS = shift; 151 $ENCODING_CONSOLE_IN = $ENCODING_LOCALE; 152 $ENCODING_CONSOLE_OUT = $ENCODING_LOCALE; 153 _init(); 154 _flush_aliases(); 155} 156 157sub decode_argv { 158 die if defined wantarray; 159 for (@ARGV) { 160 $_ = Encode::decode(locale => $_, @_); 161 } 162} 163 164sub env { 165 my $k = Encode::encode(locale => shift); 166 my $old = $ENV{$k}; 167 if (@_) { 168 my $v = shift; 169 if (defined $v) { 170 $ENV{$k} = Encode::encode(locale => $v); 171 } 172 else { 173 delete $ENV{$k}; 174 } 175 } 176 return Encode::decode(locale => $old) if defined wantarray; 177} 178 1791; 180 181__END__ 182 183=head1 NAME 184 185ExtUtils::MakeMaker::Locale - bundled Encode::Locale 186 187=head1 SYNOPSIS 188 189 use Encode::Locale; 190 use Encode; 191 192 $string = decode(locale => $bytes); 193 $bytes = encode(locale => $string); 194 195 if (-t) { 196 binmode(STDIN, ":encoding(console_in)"); 197 binmode(STDOUT, ":encoding(console_out)"); 198 binmode(STDERR, ":encoding(console_out)"); 199 } 200 201 # Processing file names passed in as arguments 202 my $uni_filename = decode(locale => $ARGV[0]); 203 open(my $fh, "<", encode(locale_fs => $uni_filename)) 204 || die "Can't open '$uni_filename': $!"; 205 binmode($fh, ":encoding(locale)"); 206 ... 207 208=head1 DESCRIPTION 209 210In many applications it's wise to let Perl use Unicode for the strings it 211processes. Most of the interfaces Perl has to the outside world are still byte 212based. Programs therefore need to decode byte strings that enter the program 213from the outside and encode them again on the way out. 214 215The POSIX locale system is used to specify both the language conventions 216requested by the user and the preferred character set to consume and 217output. The C<Encode::Locale> module looks up the charset and encoding (called 218a CODESET in the locale jargon) and arranges for the L<Encode> module to know 219this encoding under the name "locale". It means bytes obtained from the 220environment can be converted to Unicode strings by calling C<< 221Encode::encode(locale => $bytes) >> and converted back again with C<< 222Encode::decode(locale => $string) >>. 223 224Where file systems interfaces pass file names in and out of the program we also 225need care. The trend is for operating systems to use a fixed file encoding 226that don't actually depend on the locale; and this module determines the most 227appropriate encoding for file names. The L<Encode> module will know this 228encoding under the name "locale_fs". For traditional Unix systems this will 229be an alias to the same encoding as "locale". 230 231For programs running in a terminal window (called a "Console" on some systems) 232the "locale" encoding is usually a good choice for what to expect as input and 233output. Some systems allows us to query the encoding set for the terminal and 234C<Encode::Locale> will do that if available and make these encodings known 235under the C<Encode> aliases "console_in" and "console_out". For systems where 236we can't determine the terminal encoding these will be aliased as the same 237encoding as "locale". The advice is to use "console_in" for input known to 238come from the terminal and "console_out" for output to the terminal. 239 240In addition to arranging for various Encode aliases the following functions and 241variables are provided: 242 243=over 244 245=item decode_argv( ) 246 247=item decode_argv( Encode::FB_CROAK ) 248 249This will decode the command line arguments to perl (the C<@ARGV> array) in-place. 250 251The function will by default replace characters that can't be decoded by 252"\x{FFFD}", the Unicode replacement character. 253 254Any argument provided is passed as CHECK to underlying Encode::decode() call. 255Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the 256command line arguments can be decoded. See L<Encode/"Handling Malformed Data"> 257for details on other options for CHECK. 258 259=item env( $uni_key ) 260 261=item env( $uni_key => $uni_value ) 262 263Interface to get/set environment variables. Returns the current value as a 264Unicode string. The $uni_key and $uni_value arguments are expected to be 265Unicode strings as well. Passing C<undef> as $uni_value deletes the 266environment variable named $uni_key. 267 268The returned value will have the characters that can't be decoded replaced by 269"\x{FFFD}", the Unicode replacement character. 270 271There is no interface to request alternative CHECK behavior as for 272decode_argv(). If you need that you need to call encode/decode yourself. 273For example: 274 275 my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK); 276 my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK); 277 278=item reinit( ) 279 280=item reinit( $encoding ) 281 282Reinitialize the encodings from the locale. You want to call this function if 283you changed anything in the environment that might influence the locale. 284 285This function will croak if the determined encoding isn't recognized by 286the Encode module. 287 288With argument force $ENCODING_... variables to set to the given value. 289 290=item $ENCODING_LOCALE 291 292The encoding name determined to be suitable for the current locale. 293L<Encode> know this encoding as "locale". 294 295=item $ENCODING_LOCALE_FS 296 297The encoding name determined to be suitable for file system interfaces 298involving file names. 299L<Encode> know this encoding as "locale_fs". 300 301=item $ENCODING_CONSOLE_IN 302 303=item $ENCODING_CONSOLE_OUT 304 305The encodings to be used for reading and writing output to the a console. 306L<Encode> know these encodings as "console_in" and "console_out". 307 308=back 309 310=head1 NOTES 311 312This table summarizes the mapping of the encodings set up 313by the C<Encode::Locale> module: 314 315 Encode | | | 316 Alias | Windows | Mac OS X | POSIX 317 ------------+---------+--------------+------------ 318 locale | ANSI | nl_langinfo | nl_langinfo 319 locale_fs | ANSI | UTF-8 | nl_langinfo 320 console_in | OEM | nl_langinfo | nl_langinfo 321 console_out | OEM | nl_langinfo | nl_langinfo 322 323=head2 Windows 324 325Windows has basically 2 sets of APIs. A wide API (based on passing UTF-16 326strings) and a byte based API based a character set called ANSI. The 327regular Perl interfaces to the OS currently only uses the ANSI APIs. 328Unfortunately ANSI is not a single character set. 329 330The encoding that corresponds to ANSI varies between different editions of 331Windows. For many western editions of Windows ANSI corresponds to CP-1252 332which is a character set similar to ISO-8859-1. Conceptually the ANSI 333character set is a similar concept to the POSIX locale CODESET so this module 334figures out what the ANSI code page is and make this available as 335$ENCODING_LOCALE and the "locale" Encoding alias. 336 337Windows systems also operate with another byte based character set. 338It's called the OEM code page. This is the encoding that the Console 339takes as input and output. It's common for the OEM code page to 340differ from the ANSI code page. 341 342=head2 Mac OS X 343 344On Mac OS X the file system encoding is always UTF-8 while the locale 345can otherwise be set up as normal for POSIX systems. 346 347File names on Mac OS X will at the OS-level be converted to 348NFD-form. A file created by passing a NFC-filename will come 349in NFD-form from readdir(). See L<Unicode::Normalize> for details 350of NFD/NFC. 351 352Actually, Apple does not follow the Unicode NFD standard since not all 353character ranges are decomposed. The claim is that this avoids problems with 354round trip conversions from old Mac text encodings. See L<Encode::UTF8Mac> for 355details. 356 357=head2 POSIX (Linux and other Unixes) 358 359File systems might vary in what encoding is to be used for 360filenames. Since this module has no way to actually figure out 361what the is correct it goes with the best guess which is to 362assume filenames are encoding according to the current locale. 363Users are advised to always specify UTF-8 as the locale charset. 364 365=head1 SEE ALSO 366 367L<I18N::Langinfo>, L<Encode>, L<Term::Encoding> 368 369=head1 AUTHOR 370 371Copyright 2010 Gisle Aas <gisle@aas.no>. 372 373This library is free software; you can redistribute it and/or 374modify it under the same terms as Perl itself. 375 376=cut 377