xref: /openbsd-src/gnu/usr.bin/perl/cpan/ExtUtils-MakeMaker/lib/ExtUtils/MakeMaker/Locale.pm (revision 824adb5411e4389b29bae28eba5c2c2bbd147f34)
1package ExtUtils::MakeMaker::Locale;
2
3use strict;
4our $VERSION = "7.44";
5$VERSION =~ tr/_//d;
6
7use base 'Exporter';
8our @EXPORT_OK = qw(
9    decode_argv env
10    $ENCODING_LOCALE $ENCODING_LOCALE_FS
11    $ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT
12);
13
14use Encode ();
15use Encode::Alias ();
16
17our $ENCODING_LOCALE;
18our $ENCODING_LOCALE_FS;
19our $ENCODING_CONSOLE_IN;
20our $ENCODING_CONSOLE_OUT;
21
22sub DEBUG () { 0 }
23
24sub _init {
25    if ($^O eq "MSWin32") {
26	unless ($ENCODING_LOCALE) {
27	    # Try to obtain what the Windows ANSI code page is
28	    eval {
29		unless (defined &GetConsoleCP) {
30		    require Win32;
31                    # manually "import" it since Win32->import refuses
32		    *GetConsoleCP = sub { &Win32::GetConsoleCP } if defined &Win32::GetConsoleCP;
33		}
34		unless (defined &GetConsoleCP) {
35		    require Win32::API;
36		    Win32::API->Import('kernel32', 'int GetConsoleCP()');
37		}
38		if (defined &GetConsoleCP) {
39		    my $cp = GetConsoleCP();
40		    $ENCODING_LOCALE = "cp$cp" if $cp;
41		}
42	    };
43	}
44
45	unless ($ENCODING_CONSOLE_IN) {
46            # only test one since set together
47            unless (defined &GetInputCP) {
48                eval {
49                    require Win32;
50                    eval {
51                        local $SIG{__WARN__} = sub {} if ( "$]" < 5.014 ); # suppress deprecation warning for inherited AUTOLOAD of Win32::GetConsoleCP()
52                        Win32::GetConsoleCP();
53                    };
54                    # manually "import" it since Win32->import refuses
55                    *GetInputCP = sub { &Win32::GetConsoleCP } if defined &Win32::GetConsoleCP;
56                    *GetOutputCP = sub { &Win32::GetConsoleOutputCP } if defined &Win32::GetConsoleOutputCP;
57                };
58                unless (defined &GetInputCP) {
59                    eval {
60                        # try Win32::Console module for codepage to use
61                        require Win32::Console;
62                        *GetInputCP = sub { &Win32::Console::InputCP }
63                            if defined &Win32::Console::InputCP;
64                        *GetOutputCP = sub { &Win32::Console::OutputCP }
65                            if defined &Win32::Console::OutputCP;
66                    };
67                }
68                unless (defined &GetInputCP) {
69                    # final fallback
70                    *GetInputCP = *GetOutputCP = sub {
71                        # another fallback that could work is:
72                        # reg query HKLM\System\CurrentControlSet\Control\Nls\CodePage /v ACP
73                        ((qx(chcp) || '') =~ /^Active code page: (\d+)/)
74                            ? $1 : ();
75                    };
76                }
77	    }
78            my $cp = GetInputCP();
79            $ENCODING_CONSOLE_IN = "cp$cp" if $cp;
80            $cp = GetOutputCP();
81            $ENCODING_CONSOLE_OUT = "cp$cp" if $cp;
82	}
83    }
84
85    unless ($ENCODING_LOCALE) {
86	eval {
87	    require I18N::Langinfo;
88	    $ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET());
89
90	    # Workaround of Encode < v2.25.  The "646" encoding  alias was
91	    # introduced in Encode-2.25, but we don't want to require that version
92	    # quite yet.  Should avoid the CPAN testers failure reported from
93	    # openbsd-4.7/perl-5.10.0 combo.
94	    $ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646";
95
96	    # https://rt.cpan.org/Ticket/Display.html?id=66373
97	    $ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8";
98	};
99	$ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN;
100    }
101
102    # Workaround of Encode < v2.71 for "cp65000" and "cp65001"
103    # The "cp65000" and "cp65001" aliases were added in [Encode v2.71](https://github.com/dankogai/p5-encode/commit/7874bd95aa10967a3b5dbae333d16bcd703ac6c6)
104    # via commit <https://github.com/dankogai/p5-encode/commit/84b9c1101d5251d37e226f80d1c6781718779047>.
105    # This will avoid test failures for Win32 machines using the UTF-7 or UTF-8 code pages.
106    $ENCODING_LOCALE = 'UTF-7' if $ENCODING_LOCALE && lc($ENCODING_LOCALE) eq "cp65000";
107    $ENCODING_LOCALE = 'utf-8-strict' if $ENCODING_LOCALE && lc($ENCODING_LOCALE) eq "cp65001";
108
109    if ($^O eq "darwin") {
110	$ENCODING_LOCALE_FS ||= "UTF-8";
111    }
112
113    # final fallback
114    $ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8";
115    $ENCODING_LOCALE_FS ||= $ENCODING_LOCALE;
116    $ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE;
117    $ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN;
118
119    unless (Encode::find_encoding($ENCODING_LOCALE)) {
120	my $foundit;
121	if (lc($ENCODING_LOCALE) eq "gb18030") {
122	    eval {
123		require Encode::HanExtra;
124	    };
125	    if ($@) {
126		die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped";
127	    }
128	    $foundit++ if Encode::find_encoding($ENCODING_LOCALE);
129	}
130	die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped"
131	    unless $foundit;
132
133    }
134
135    # use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT;
136}
137
138_init();
139Encode::Alias::define_alias(sub {
140    no strict 'refs';
141    no warnings 'once';
142    return ${"ENCODING_" . uc(shift)};
143}, "locale");
144
145sub _flush_aliases {
146    no strict 'refs';
147    for my $a (sort keys %Encode::Alias::Alias) {
148	if (defined ${"ENCODING_" . uc($a)}) {
149	    delete $Encode::Alias::Alias{$a};
150	    warn "Flushed alias cache for $a" if DEBUG;
151	}
152    }
153}
154
155sub reinit {
156    $ENCODING_LOCALE = shift;
157    $ENCODING_LOCALE_FS = shift;
158    $ENCODING_CONSOLE_IN = $ENCODING_LOCALE;
159    $ENCODING_CONSOLE_OUT = $ENCODING_LOCALE;
160    _init();
161    _flush_aliases();
162}
163
164sub decode_argv {
165    die if defined wantarray;
166    for (@ARGV) {
167	$_ = Encode::decode(locale => $_, @_);
168    }
169}
170
171sub env {
172    my $k = Encode::encode(locale => shift);
173    my $old = $ENV{$k};
174    if (@_) {
175	my $v = shift;
176	if (defined $v) {
177	    $ENV{$k} = Encode::encode(locale => $v);
178	}
179	else {
180	    delete $ENV{$k};
181	}
182    }
183    return Encode::decode(locale => $old) if defined wantarray;
184}
185
1861;
187
188__END__
189
190=head1 NAME
191
192ExtUtils::MakeMaker::Locale - bundled Encode::Locale
193
194=head1 SYNOPSIS
195
196  use Encode::Locale;
197  use Encode;
198
199  $string = decode(locale => $bytes);
200  $bytes = encode(locale => $string);
201
202  if (-t) {
203      binmode(STDIN, ":encoding(console_in)");
204      binmode(STDOUT, ":encoding(console_out)");
205      binmode(STDERR, ":encoding(console_out)");
206  }
207
208  # Processing file names passed in as arguments
209  my $uni_filename = decode(locale => $ARGV[0]);
210  open(my $fh, "<", encode(locale_fs => $uni_filename))
211     || die "Can't open '$uni_filename': $!";
212  binmode($fh, ":encoding(locale)");
213  ...
214
215=head1 DESCRIPTION
216
217In many applications it's wise to let Perl use Unicode for the strings it
218processes.  Most of the interfaces Perl has to the outside world are still byte
219based.  Programs therefore need to decode byte strings that enter the program
220from the outside and encode them again on the way out.
221
222The POSIX locale system is used to specify both the language conventions
223requested by the user and the preferred character set to consume and
224output.  The C<Encode::Locale> module looks up the charset and encoding (called
225a CODESET in the locale jargon) and arranges for the L<Encode> module to know
226this encoding under the name "locale".  It means bytes obtained from the
227environment can be converted to Unicode strings by calling C<<
228Encode::encode(locale => $bytes) >> and converted back again with C<<
229Encode::decode(locale => $string) >>.
230
231Where file systems interfaces pass file names in and out of the program we also
232need care.  The trend is for operating systems to use a fixed file encoding
233that don't actually depend on the locale; and this module determines the most
234appropriate encoding for file names. The L<Encode> module will know this
235encoding under the name "locale_fs".  For traditional Unix systems this will
236be an alias to the same encoding as "locale".
237
238For programs running in a terminal window (called a "Console" on some systems)
239the "locale" encoding is usually a good choice for what to expect as input and
240output.  Some systems allows us to query the encoding set for the terminal and
241C<Encode::Locale> will do that if available and make these encodings known
242under the C<Encode> aliases "console_in" and "console_out".  For systems where
243we can't determine the terminal encoding these will be aliased as the same
244encoding as "locale".  The advice is to use "console_in" for input known to
245come from the terminal and "console_out" for output to the terminal.
246
247In addition to arranging for various Encode aliases the following functions and
248variables are provided:
249
250=over
251
252=item decode_argv( )
253
254=item decode_argv( Encode::FB_CROAK )
255
256This will decode the command line arguments to perl (the C<@ARGV> array) in-place.
257
258The function will by default replace characters that can't be decoded by
259"\x{FFFD}", the Unicode replacement character.
260
261Any argument provided is passed as CHECK to underlying Encode::decode() call.
262Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the
263command line arguments can be decoded.  See L<Encode/"Handling Malformed Data">
264for details on other options for CHECK.
265
266=item env( $uni_key )
267
268=item env( $uni_key => $uni_value )
269
270Interface to get/set environment variables.  Returns the current value as a
271Unicode string. The $uni_key and $uni_value arguments are expected to be
272Unicode strings as well.  Passing C<undef> as $uni_value deletes the
273environment variable named $uni_key.
274
275The returned value will have the characters that can't be decoded replaced by
276"\x{FFFD}", the Unicode replacement character.
277
278There is no interface to request alternative CHECK behavior as for
279decode_argv().  If you need that you need to call encode/decode yourself.
280For example:
281
282    my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK);
283    my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK);
284
285=item reinit( )
286
287=item reinit( $encoding )
288
289Reinitialize the encodings from the locale.  You want to call this function if
290you changed anything in the environment that might influence the locale.
291
292This function will croak if the determined encoding isn't recognized by
293the Encode module.
294
295With argument force $ENCODING_... variables to set to the given value.
296
297=item $ENCODING_LOCALE
298
299The encoding name determined to be suitable for the current locale.
300L<Encode> know this encoding as "locale".
301
302=item $ENCODING_LOCALE_FS
303
304The encoding name determined to be suitable for file system interfaces
305involving file names.
306L<Encode> know this encoding as "locale_fs".
307
308=item $ENCODING_CONSOLE_IN
309
310=item $ENCODING_CONSOLE_OUT
311
312The encodings to be used for reading and writing output to the a console.
313L<Encode> know these encodings as "console_in" and "console_out".
314
315=back
316
317=head1 NOTES
318
319This table summarizes the mapping of the encodings set up
320by the C<Encode::Locale> module:
321
322  Encode      |         |              |
323  Alias       | Windows | Mac OS X     | POSIX
324  ------------+---------+--------------+------------
325  locale      | ANSI    | nl_langinfo  | nl_langinfo
326  locale_fs   | ANSI    | UTF-8        | nl_langinfo
327  console_in  | OEM     | nl_langinfo  | nl_langinfo
328  console_out | OEM     | nl_langinfo  | nl_langinfo
329
330=head2 Windows
331
332Windows has basically 2 sets of APIs.  A wide API (based on passing UTF-16
333strings) and a byte based API based a character set called ANSI.  The
334regular Perl interfaces to the OS currently only uses the ANSI APIs.
335Unfortunately ANSI is not a single character set.
336
337The encoding that corresponds to ANSI varies between different editions of
338Windows.  For many western editions of Windows ANSI corresponds to CP-1252
339which is a character set similar to ISO-8859-1.  Conceptually the ANSI
340character set is a similar concept to the POSIX locale CODESET so this module
341figures out what the ANSI code page is and make this available as
342$ENCODING_LOCALE and the "locale" Encoding alias.
343
344Windows systems also operate with another byte based character set.
345It's called the OEM code page.  This is the encoding that the Console
346takes as input and output.  It's common for the OEM code page to
347differ from the ANSI code page.
348
349=head2 Mac OS X
350
351On Mac OS X the file system encoding is always UTF-8 while the locale
352can otherwise be set up as normal for POSIX systems.
353
354File names on Mac OS X will at the OS-level be converted to
355NFD-form.  A file created by passing a NFC-filename will come
356in NFD-form from readdir().  See L<Unicode::Normalize> for details
357of NFD/NFC.
358
359Actually, Apple does not follow the Unicode NFD standard since not all
360character ranges are decomposed.  The claim is that this avoids problems with
361round trip conversions from old Mac text encodings.  See L<Encode::UTF8Mac> for
362details.
363
364=head2 POSIX (Linux and other Unixes)
365
366File systems might vary in what encoding is to be used for
367filenames.  Since this module has no way to actually figure out
368what the is correct it goes with the best guess which is to
369assume filenames are encoding according to the current locale.
370Users are advised to always specify UTF-8 as the locale charset.
371
372=head1 SEE ALSO
373
374L<I18N::Langinfo>, L<Encode>, L<Term::Encoding>
375
376=head1 AUTHOR
377
378Copyright 2010 Gisle Aas <gisle@aas.no>.
379
380This library is free software; you can redistribute it and/or
381modify it under the same terms as Perl itself.
382
383=cut
384