1package ExtUtils::MakeMaker::Locale; 2 3use strict; 4our $VERSION = "7.44"; 5$VERSION =~ tr/_//d; 6 7use base 'Exporter'; 8our @EXPORT_OK = qw( 9 decode_argv env 10 $ENCODING_LOCALE $ENCODING_LOCALE_FS 11 $ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT 12); 13 14use Encode (); 15use Encode::Alias (); 16 17our $ENCODING_LOCALE; 18our $ENCODING_LOCALE_FS; 19our $ENCODING_CONSOLE_IN; 20our $ENCODING_CONSOLE_OUT; 21 22sub DEBUG () { 0 } 23 24sub _init { 25 if ($^O eq "MSWin32") { 26 unless ($ENCODING_LOCALE) { 27 # Try to obtain what the Windows ANSI code page is 28 eval { 29 unless (defined &GetConsoleCP) { 30 require Win32; 31 # manually "import" it since Win32->import refuses 32 *GetConsoleCP = sub { &Win32::GetConsoleCP } if defined &Win32::GetConsoleCP; 33 } 34 unless (defined &GetConsoleCP) { 35 require Win32::API; 36 Win32::API->Import('kernel32', 'int GetConsoleCP()'); 37 } 38 if (defined &GetConsoleCP) { 39 my $cp = GetConsoleCP(); 40 $ENCODING_LOCALE = "cp$cp" if $cp; 41 } 42 }; 43 } 44 45 unless ($ENCODING_CONSOLE_IN) { 46 # only test one since set together 47 unless (defined &GetInputCP) { 48 eval { 49 require Win32; 50 eval { 51 local $SIG{__WARN__} = sub {} if ( "$]" < 5.014 ); # suppress deprecation warning for inherited AUTOLOAD of Win32::GetConsoleCP() 52 Win32::GetConsoleCP(); 53 }; 54 # manually "import" it since Win32->import refuses 55 *GetInputCP = sub { &Win32::GetConsoleCP } if defined &Win32::GetConsoleCP; 56 *GetOutputCP = sub { &Win32::GetConsoleOutputCP } if defined &Win32::GetConsoleOutputCP; 57 }; 58 unless (defined &GetInputCP) { 59 eval { 60 # try Win32::Console module for codepage to use 61 require Win32::Console; 62 *GetInputCP = sub { &Win32::Console::InputCP } 63 if defined &Win32::Console::InputCP; 64 *GetOutputCP = sub { &Win32::Console::OutputCP } 65 if defined &Win32::Console::OutputCP; 66 }; 67 } 68 unless (defined &GetInputCP) { 69 # final fallback 70 *GetInputCP = *GetOutputCP = sub { 71 # another fallback that could work is: 72 # reg query HKLM\System\CurrentControlSet\Control\Nls\CodePage /v ACP 73 ((qx(chcp) || '') =~ /^Active code page: (\d+)/) 74 ? $1 : (); 75 }; 76 } 77 } 78 my $cp = GetInputCP(); 79 $ENCODING_CONSOLE_IN = "cp$cp" if $cp; 80 $cp = GetOutputCP(); 81 $ENCODING_CONSOLE_OUT = "cp$cp" if $cp; 82 } 83 } 84 85 unless ($ENCODING_LOCALE) { 86 eval { 87 require I18N::Langinfo; 88 $ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET()); 89 90 # Workaround of Encode < v2.25. The "646" encoding alias was 91 # introduced in Encode-2.25, but we don't want to require that version 92 # quite yet. Should avoid the CPAN testers failure reported from 93 # openbsd-4.7/perl-5.10.0 combo. 94 $ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646"; 95 96 # https://rt.cpan.org/Ticket/Display.html?id=66373 97 $ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8"; 98 }; 99 $ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN; 100 } 101 102 # Workaround of Encode < v2.71 for "cp65000" and "cp65001" 103 # The "cp65000" and "cp65001" aliases were added in [Encode v2.71](https://github.com/dankogai/p5-encode/commit/7874bd95aa10967a3b5dbae333d16bcd703ac6c6) 104 # via commit <https://github.com/dankogai/p5-encode/commit/84b9c1101d5251d37e226f80d1c6781718779047>. 105 # This will avoid test failures for Win32 machines using the UTF-7 or UTF-8 code pages. 106 $ENCODING_LOCALE = 'UTF-7' if $ENCODING_LOCALE && lc($ENCODING_LOCALE) eq "cp65000"; 107 $ENCODING_LOCALE = 'utf-8-strict' if $ENCODING_LOCALE && lc($ENCODING_LOCALE) eq "cp65001"; 108 109 if ($^O eq "darwin") { 110 $ENCODING_LOCALE_FS ||= "UTF-8"; 111 } 112 113 # final fallback 114 $ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8"; 115 $ENCODING_LOCALE_FS ||= $ENCODING_LOCALE; 116 $ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE; 117 $ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN; 118 119 unless (Encode::find_encoding($ENCODING_LOCALE)) { 120 my $foundit; 121 if (lc($ENCODING_LOCALE) eq "gb18030") { 122 eval { 123 require Encode::HanExtra; 124 }; 125 if ($@) { 126 die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped"; 127 } 128 $foundit++ if Encode::find_encoding($ENCODING_LOCALE); 129 } 130 die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped" 131 unless $foundit; 132 133 } 134 135 # use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT; 136} 137 138_init(); 139Encode::Alias::define_alias(sub { 140 no strict 'refs'; 141 no warnings 'once'; 142 return ${"ENCODING_" . uc(shift)}; 143}, "locale"); 144 145sub _flush_aliases { 146 no strict 'refs'; 147 for my $a (sort keys %Encode::Alias::Alias) { 148 if (defined ${"ENCODING_" . uc($a)}) { 149 delete $Encode::Alias::Alias{$a}; 150 warn "Flushed alias cache for $a" if DEBUG; 151 } 152 } 153} 154 155sub reinit { 156 $ENCODING_LOCALE = shift; 157 $ENCODING_LOCALE_FS = shift; 158 $ENCODING_CONSOLE_IN = $ENCODING_LOCALE; 159 $ENCODING_CONSOLE_OUT = $ENCODING_LOCALE; 160 _init(); 161 _flush_aliases(); 162} 163 164sub decode_argv { 165 die if defined wantarray; 166 for (@ARGV) { 167 $_ = Encode::decode(locale => $_, @_); 168 } 169} 170 171sub env { 172 my $k = Encode::encode(locale => shift); 173 my $old = $ENV{$k}; 174 if (@_) { 175 my $v = shift; 176 if (defined $v) { 177 $ENV{$k} = Encode::encode(locale => $v); 178 } 179 else { 180 delete $ENV{$k}; 181 } 182 } 183 return Encode::decode(locale => $old) if defined wantarray; 184} 185 1861; 187 188__END__ 189 190=head1 NAME 191 192ExtUtils::MakeMaker::Locale - bundled Encode::Locale 193 194=head1 SYNOPSIS 195 196 use Encode::Locale; 197 use Encode; 198 199 $string = decode(locale => $bytes); 200 $bytes = encode(locale => $string); 201 202 if (-t) { 203 binmode(STDIN, ":encoding(console_in)"); 204 binmode(STDOUT, ":encoding(console_out)"); 205 binmode(STDERR, ":encoding(console_out)"); 206 } 207 208 # Processing file names passed in as arguments 209 my $uni_filename = decode(locale => $ARGV[0]); 210 open(my $fh, "<", encode(locale_fs => $uni_filename)) 211 || die "Can't open '$uni_filename': $!"; 212 binmode($fh, ":encoding(locale)"); 213 ... 214 215=head1 DESCRIPTION 216 217In many applications it's wise to let Perl use Unicode for the strings it 218processes. Most of the interfaces Perl has to the outside world are still byte 219based. Programs therefore need to decode byte strings that enter the program 220from the outside and encode them again on the way out. 221 222The POSIX locale system is used to specify both the language conventions 223requested by the user and the preferred character set to consume and 224output. The C<Encode::Locale> module looks up the charset and encoding (called 225a CODESET in the locale jargon) and arranges for the L<Encode> module to know 226this encoding under the name "locale". It means bytes obtained from the 227environment can be converted to Unicode strings by calling C<< 228Encode::encode(locale => $bytes) >> and converted back again with C<< 229Encode::decode(locale => $string) >>. 230 231Where file systems interfaces pass file names in and out of the program we also 232need care. The trend is for operating systems to use a fixed file encoding 233that don't actually depend on the locale; and this module determines the most 234appropriate encoding for file names. The L<Encode> module will know this 235encoding under the name "locale_fs". For traditional Unix systems this will 236be an alias to the same encoding as "locale". 237 238For programs running in a terminal window (called a "Console" on some systems) 239the "locale" encoding is usually a good choice for what to expect as input and 240output. Some systems allows us to query the encoding set for the terminal and 241C<Encode::Locale> will do that if available and make these encodings known 242under the C<Encode> aliases "console_in" and "console_out". For systems where 243we can't determine the terminal encoding these will be aliased as the same 244encoding as "locale". The advice is to use "console_in" for input known to 245come from the terminal and "console_out" for output to the terminal. 246 247In addition to arranging for various Encode aliases the following functions and 248variables are provided: 249 250=over 251 252=item decode_argv( ) 253 254=item decode_argv( Encode::FB_CROAK ) 255 256This will decode the command line arguments to perl (the C<@ARGV> array) in-place. 257 258The function will by default replace characters that can't be decoded by 259"\x{FFFD}", the Unicode replacement character. 260 261Any argument provided is passed as CHECK to underlying Encode::decode() call. 262Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the 263command line arguments can be decoded. See L<Encode/"Handling Malformed Data"> 264for details on other options for CHECK. 265 266=item env( $uni_key ) 267 268=item env( $uni_key => $uni_value ) 269 270Interface to get/set environment variables. Returns the current value as a 271Unicode string. The $uni_key and $uni_value arguments are expected to be 272Unicode strings as well. Passing C<undef> as $uni_value deletes the 273environment variable named $uni_key. 274 275The returned value will have the characters that can't be decoded replaced by 276"\x{FFFD}", the Unicode replacement character. 277 278There is no interface to request alternative CHECK behavior as for 279decode_argv(). If you need that you need to call encode/decode yourself. 280For example: 281 282 my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK); 283 my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK); 284 285=item reinit( ) 286 287=item reinit( $encoding ) 288 289Reinitialize the encodings from the locale. You want to call this function if 290you changed anything in the environment that might influence the locale. 291 292This function will croak if the determined encoding isn't recognized by 293the Encode module. 294 295With argument force $ENCODING_... variables to set to the given value. 296 297=item $ENCODING_LOCALE 298 299The encoding name determined to be suitable for the current locale. 300L<Encode> know this encoding as "locale". 301 302=item $ENCODING_LOCALE_FS 303 304The encoding name determined to be suitable for file system interfaces 305involving file names. 306L<Encode> know this encoding as "locale_fs". 307 308=item $ENCODING_CONSOLE_IN 309 310=item $ENCODING_CONSOLE_OUT 311 312The encodings to be used for reading and writing output to the a console. 313L<Encode> know these encodings as "console_in" and "console_out". 314 315=back 316 317=head1 NOTES 318 319This table summarizes the mapping of the encodings set up 320by the C<Encode::Locale> module: 321 322 Encode | | | 323 Alias | Windows | Mac OS X | POSIX 324 ------------+---------+--------------+------------ 325 locale | ANSI | nl_langinfo | nl_langinfo 326 locale_fs | ANSI | UTF-8 | nl_langinfo 327 console_in | OEM | nl_langinfo | nl_langinfo 328 console_out | OEM | nl_langinfo | nl_langinfo 329 330=head2 Windows 331 332Windows has basically 2 sets of APIs. A wide API (based on passing UTF-16 333strings) and a byte based API based a character set called ANSI. The 334regular Perl interfaces to the OS currently only uses the ANSI APIs. 335Unfortunately ANSI is not a single character set. 336 337The encoding that corresponds to ANSI varies between different editions of 338Windows. For many western editions of Windows ANSI corresponds to CP-1252 339which is a character set similar to ISO-8859-1. Conceptually the ANSI 340character set is a similar concept to the POSIX locale CODESET so this module 341figures out what the ANSI code page is and make this available as 342$ENCODING_LOCALE and the "locale" Encoding alias. 343 344Windows systems also operate with another byte based character set. 345It's called the OEM code page. This is the encoding that the Console 346takes as input and output. It's common for the OEM code page to 347differ from the ANSI code page. 348 349=head2 Mac OS X 350 351On Mac OS X the file system encoding is always UTF-8 while the locale 352can otherwise be set up as normal for POSIX systems. 353 354File names on Mac OS X will at the OS-level be converted to 355NFD-form. A file created by passing a NFC-filename will come 356in NFD-form from readdir(). See L<Unicode::Normalize> for details 357of NFD/NFC. 358 359Actually, Apple does not follow the Unicode NFD standard since not all 360character ranges are decomposed. The claim is that this avoids problems with 361round trip conversions from old Mac text encodings. See L<Encode::UTF8Mac> for 362details. 363 364=head2 POSIX (Linux and other Unixes) 365 366File systems might vary in what encoding is to be used for 367filenames. Since this module has no way to actually figure out 368what the is correct it goes with the best guess which is to 369assume filenames are encoding according to the current locale. 370Users are advised to always specify UTF-8 as the locale charset. 371 372=head1 SEE ALSO 373 374L<I18N::Langinfo>, L<Encode>, L<Term::Encoding> 375 376=head1 AUTHOR 377 378Copyright 2010 Gisle Aas <gisle@aas.no>. 379 380This library is free software; you can redistribute it and/or 381modify it under the same terms as Perl itself. 382 383=cut 384