xref: /openbsd-src/gnu/usr.bin/perl/cpan/ExtUtils-MakeMaker/lib/ExtUtils/MakeMaker/Locale.pm (revision ae3cb403620ab940fbaabb3055fac045a63d56b7)
1package ExtUtils::MakeMaker::Locale;
2
3use strict;
4our $VERSION = "7.10";
5
6use base 'Exporter';
7our @EXPORT_OK = qw(
8    decode_argv env
9    $ENCODING_LOCALE $ENCODING_LOCALE_FS
10    $ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT
11);
12
13use Encode ();
14use Encode::Alias ();
15
16our $ENCODING_LOCALE;
17our $ENCODING_LOCALE_FS;
18our $ENCODING_CONSOLE_IN;
19our $ENCODING_CONSOLE_OUT;
20
21sub DEBUG () { 0 }
22
23sub _init {
24    if ($^O eq "MSWin32") {
25	unless ($ENCODING_LOCALE) {
26	    # Try to obtain what the Windows ANSI code page is
27	    eval {
28		unless (defined &GetConsoleCP) {
29		    require Win32;
30                    # no point falling back to Win32::GetConsoleCP from this
31                    # as added same time, 0.45
32                    eval { Win32::GetConsoleCP() };
33                    # manually "import" it since Win32->import refuses
34		    *GetConsoleCP = sub { &Win32::GetConsoleCP } unless $@;
35		}
36		unless (defined &GetConsoleCP) {
37		    require Win32::API;
38		    Win32::API->Import('kernel32', 'int GetConsoleCP()');
39		}
40		if (defined &GetConsoleCP) {
41		    my $cp = GetConsoleCP();
42		    $ENCODING_LOCALE = "cp$cp" if $cp;
43		}
44	    };
45	}
46
47	unless ($ENCODING_CONSOLE_IN) {
48            # only test one since set together
49            unless (defined &GetInputCP) {
50                eval {
51                    require Win32;
52                    eval { Win32::GetConsoleCP() };
53                    # manually "import" it since Win32->import refuses
54                    *GetInputCP = sub { &Win32::GetConsoleCP } unless $@;
55                    *GetOutputCP = sub { &Win32::GetConsoleOutputCP } unless $@;
56                };
57                unless (defined &GetInputCP) {
58                    eval {
59                        # try Win32::Console module for codepage to use
60                        require Win32::Console;
61                        eval { Win32::Console::InputCP() };
62                        *GetInputCP = sub { &Win32::Console::InputCP }
63                            unless $@;
64                        *GetOutputCP = sub { &Win32::Console::OutputCP }
65                            unless $@;
66                    };
67                }
68                unless (defined &GetInputCP) {
69                    # final fallback
70                    *GetInputCP = *GetOutputCP = sub {
71                        # another fallback that could work is:
72                        # reg query HKLM\System\CurrentControlSet\Control\Nls\CodePage /v ACP
73                        ((qx(chcp) || '') =~ /^Active code page: (\d+)/)
74                            ? $1 : ();
75                    };
76                }
77	    }
78            my $cp = GetInputCP();
79            $ENCODING_CONSOLE_IN = "cp$cp" if $cp;
80            $cp = GetOutputCP();
81            $ENCODING_CONSOLE_OUT = "cp$cp" if $cp;
82	}
83    }
84
85    unless ($ENCODING_LOCALE) {
86	eval {
87	    require I18N::Langinfo;
88	    $ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET());
89
90	    # Workaround of Encode < v2.25.  The "646" encoding  alias was
91	    # introduced in Encode-2.25, but we don't want to require that version
92	    # quite yet.  Should avoid the CPAN testers failure reported from
93	    # openbsd-4.7/perl-5.10.0 combo.
94	    $ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646";
95
96	    # https://rt.cpan.org/Ticket/Display.html?id=66373
97	    $ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8";
98	};
99	$ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN;
100    }
101
102    if ($^O eq "darwin") {
103	$ENCODING_LOCALE_FS ||= "UTF-8";
104    }
105
106    # final fallback
107    $ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8";
108    $ENCODING_LOCALE_FS ||= $ENCODING_LOCALE;
109    $ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE;
110    $ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN;
111
112    unless (Encode::find_encoding($ENCODING_LOCALE)) {
113	my $foundit;
114	if (lc($ENCODING_LOCALE) eq "gb18030") {
115	    eval {
116		require Encode::HanExtra;
117	    };
118	    if ($@) {
119		die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped";
120	    }
121	    $foundit++ if Encode::find_encoding($ENCODING_LOCALE);
122	}
123	die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped"
124	    unless $foundit;
125
126    }
127
128    # use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT;
129}
130
131_init();
132Encode::Alias::define_alias(sub {
133    no strict 'refs';
134    no warnings 'once';
135    return ${"ENCODING_" . uc(shift)};
136}, "locale");
137
138sub _flush_aliases {
139    no strict 'refs';
140    for my $a (keys %Encode::Alias::Alias) {
141	if (defined ${"ENCODING_" . uc($a)}) {
142	    delete $Encode::Alias::Alias{$a};
143	    warn "Flushed alias cache for $a" if DEBUG;
144	}
145    }
146}
147
148sub reinit {
149    $ENCODING_LOCALE = shift;
150    $ENCODING_LOCALE_FS = shift;
151    $ENCODING_CONSOLE_IN = $ENCODING_LOCALE;
152    $ENCODING_CONSOLE_OUT = $ENCODING_LOCALE;
153    _init();
154    _flush_aliases();
155}
156
157sub decode_argv {
158    die if defined wantarray;
159    for (@ARGV) {
160	$_ = Encode::decode(locale => $_, @_);
161    }
162}
163
164sub env {
165    my $k = Encode::encode(locale => shift);
166    my $old = $ENV{$k};
167    if (@_) {
168	my $v = shift;
169	if (defined $v) {
170	    $ENV{$k} = Encode::encode(locale => $v);
171	}
172	else {
173	    delete $ENV{$k};
174	}
175    }
176    return Encode::decode(locale => $old) if defined wantarray;
177}
178
1791;
180
181__END__
182
183=head1 NAME
184
185ExtUtils::MakeMaker::Locale - bundled Encode::Locale
186
187=head1 SYNOPSIS
188
189  use Encode::Locale;
190  use Encode;
191
192  $string = decode(locale => $bytes);
193  $bytes = encode(locale => $string);
194
195  if (-t) {
196      binmode(STDIN, ":encoding(console_in)");
197      binmode(STDOUT, ":encoding(console_out)");
198      binmode(STDERR, ":encoding(console_out)");
199  }
200
201  # Processing file names passed in as arguments
202  my $uni_filename = decode(locale => $ARGV[0]);
203  open(my $fh, "<", encode(locale_fs => $uni_filename))
204     || die "Can't open '$uni_filename': $!";
205  binmode($fh, ":encoding(locale)");
206  ...
207
208=head1 DESCRIPTION
209
210In many applications it's wise to let Perl use Unicode for the strings it
211processes.  Most of the interfaces Perl has to the outside world are still byte
212based.  Programs therefore need to decode byte strings that enter the program
213from the outside and encode them again on the way out.
214
215The POSIX locale system is used to specify both the language conventions
216requested by the user and the preferred character set to consume and
217output.  The C<Encode::Locale> module looks up the charset and encoding (called
218a CODESET in the locale jargon) and arranges for the L<Encode> module to know
219this encoding under the name "locale".  It means bytes obtained from the
220environment can be converted to Unicode strings by calling C<<
221Encode::encode(locale => $bytes) >> and converted back again with C<<
222Encode::decode(locale => $string) >>.
223
224Where file systems interfaces pass file names in and out of the program we also
225need care.  The trend is for operating systems to use a fixed file encoding
226that don't actually depend on the locale; and this module determines the most
227appropriate encoding for file names. The L<Encode> module will know this
228encoding under the name "locale_fs".  For traditional Unix systems this will
229be an alias to the same encoding as "locale".
230
231For programs running in a terminal window (called a "Console" on some systems)
232the "locale" encoding is usually a good choice for what to expect as input and
233output.  Some systems allows us to query the encoding set for the terminal and
234C<Encode::Locale> will do that if available and make these encodings known
235under the C<Encode> aliases "console_in" and "console_out".  For systems where
236we can't determine the terminal encoding these will be aliased as the same
237encoding as "locale".  The advice is to use "console_in" for input known to
238come from the terminal and "console_out" for output to the terminal.
239
240In addition to arranging for various Encode aliases the following functions and
241variables are provided:
242
243=over
244
245=item decode_argv( )
246
247=item decode_argv( Encode::FB_CROAK )
248
249This will decode the command line arguments to perl (the C<@ARGV> array) in-place.
250
251The function will by default replace characters that can't be decoded by
252"\x{FFFD}", the Unicode replacement character.
253
254Any argument provided is passed as CHECK to underlying Encode::decode() call.
255Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the
256command line arguments can be decoded.  See L<Encode/"Handling Malformed Data">
257for details on other options for CHECK.
258
259=item env( $uni_key )
260
261=item env( $uni_key => $uni_value )
262
263Interface to get/set environment variables.  Returns the current value as a
264Unicode string. The $uni_key and $uni_value arguments are expected to be
265Unicode strings as well.  Passing C<undef> as $uni_value deletes the
266environment variable named $uni_key.
267
268The returned value will have the characters that can't be decoded replaced by
269"\x{FFFD}", the Unicode replacement character.
270
271There is no interface to request alternative CHECK behavior as for
272decode_argv().  If you need that you need to call encode/decode yourself.
273For example:
274
275    my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK);
276    my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK);
277
278=item reinit( )
279
280=item reinit( $encoding )
281
282Reinitialize the encodings from the locale.  You want to call this function if
283you changed anything in the environment that might influence the locale.
284
285This function will croak if the determined encoding isn't recognized by
286the Encode module.
287
288With argument force $ENCODING_... variables to set to the given value.
289
290=item $ENCODING_LOCALE
291
292The encoding name determined to be suitable for the current locale.
293L<Encode> know this encoding as "locale".
294
295=item $ENCODING_LOCALE_FS
296
297The encoding name determined to be suitable for file system interfaces
298involving file names.
299L<Encode> know this encoding as "locale_fs".
300
301=item $ENCODING_CONSOLE_IN
302
303=item $ENCODING_CONSOLE_OUT
304
305The encodings to be used for reading and writing output to the a console.
306L<Encode> know these encodings as "console_in" and "console_out".
307
308=back
309
310=head1 NOTES
311
312This table summarizes the mapping of the encodings set up
313by the C<Encode::Locale> module:
314
315  Encode      |         |              |
316  Alias       | Windows | Mac OS X     | POSIX
317  ------------+---------+--------------+------------
318  locale      | ANSI    | nl_langinfo  | nl_langinfo
319  locale_fs   | ANSI    | UTF-8        | nl_langinfo
320  console_in  | OEM     | nl_langinfo  | nl_langinfo
321  console_out | OEM     | nl_langinfo  | nl_langinfo
322
323=head2 Windows
324
325Windows has basically 2 sets of APIs.  A wide API (based on passing UTF-16
326strings) and a byte based API based a character set called ANSI.  The
327regular Perl interfaces to the OS currently only uses the ANSI APIs.
328Unfortunately ANSI is not a single character set.
329
330The encoding that corresponds to ANSI varies between different editions of
331Windows.  For many western editions of Windows ANSI corresponds to CP-1252
332which is a character set similar to ISO-8859-1.  Conceptually the ANSI
333character set is a similar concept to the POSIX locale CODESET so this module
334figures out what the ANSI code page is and make this available as
335$ENCODING_LOCALE and the "locale" Encoding alias.
336
337Windows systems also operate with another byte based character set.
338It's called the OEM code page.  This is the encoding that the Console
339takes as input and output.  It's common for the OEM code page to
340differ from the ANSI code page.
341
342=head2 Mac OS X
343
344On Mac OS X the file system encoding is always UTF-8 while the locale
345can otherwise be set up as normal for POSIX systems.
346
347File names on Mac OS X will at the OS-level be converted to
348NFD-form.  A file created by passing a NFC-filename will come
349in NFD-form from readdir().  See L<Unicode::Normalize> for details
350of NFD/NFC.
351
352Actually, Apple does not follow the Unicode NFD standard since not all
353character ranges are decomposed.  The claim is that this avoids problems with
354round trip conversions from old Mac text encodings.  See L<Encode::UTF8Mac> for
355details.
356
357=head2 POSIX (Linux and other Unixes)
358
359File systems might vary in what encoding is to be used for
360filenames.  Since this module has no way to actually figure out
361what the is correct it goes with the best guess which is to
362assume filenames are encoding according to the current locale.
363Users are advised to always specify UTF-8 as the locale charset.
364
365=head1 SEE ALSO
366
367L<I18N::Langinfo>, L<Encode>, L<Term::Encoding>
368
369=head1 AUTHOR
370
371Copyright 2010 Gisle Aas <gisle@aas.no>.
372
373This library is free software; you can redistribute it and/or
374modify it under the same terms as Perl itself.
375
376=cut
377