xref: /openbsd-src/gnu/usr.bin/perl/cpan/ExtUtils-MakeMaker/lib/ExtUtils/MakeMaker/Locale.pm (revision e068048151d29f2562a32185e21a8ba885482260)
1b8851fccSafresh1package ExtUtils::MakeMaker::Locale;
2b8851fccSafresh1
3b8851fccSafresh1use strict;
4eac174f2Safresh1use warnings;
5*e0680481Safresh1our $VERSION = "7.70";
656d68f1eSafresh1$VERSION =~ tr/_//d;
7b8851fccSafresh1
8b8851fccSafresh1use base 'Exporter';
9b8851fccSafresh1our @EXPORT_OK = qw(
10b8851fccSafresh1    decode_argv env
11b8851fccSafresh1    $ENCODING_LOCALE $ENCODING_LOCALE_FS
12b8851fccSafresh1    $ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT
13b8851fccSafresh1);
14b8851fccSafresh1
15b8851fccSafresh1use Encode ();
16b8851fccSafresh1use Encode::Alias ();
17b8851fccSafresh1
18b8851fccSafresh1our $ENCODING_LOCALE;
19b8851fccSafresh1our $ENCODING_LOCALE_FS;
20b8851fccSafresh1our $ENCODING_CONSOLE_IN;
21b8851fccSafresh1our $ENCODING_CONSOLE_OUT;
22b8851fccSafresh1
23b8851fccSafresh1sub DEBUG () { 0 }
24b8851fccSafresh1
25b8851fccSafresh1sub _init {
26b8851fccSafresh1    if ($^O eq "MSWin32") {
27b8851fccSafresh1	unless ($ENCODING_LOCALE) {
28b8851fccSafresh1	    # Try to obtain what the Windows ANSI code page is
29b8851fccSafresh1	    eval {
30b8851fccSafresh1		unless (defined &GetConsoleCP) {
31b8851fccSafresh1		    require Win32;
32b8851fccSafresh1                    # manually "import" it since Win32->import refuses
335759b3d2Safresh1		    *GetConsoleCP = sub { &Win32::GetConsoleCP } if defined &Win32::GetConsoleCP;
34b8851fccSafresh1		}
35b8851fccSafresh1		unless (defined &GetConsoleCP) {
36b8851fccSafresh1		    require Win32::API;
37b8851fccSafresh1		    Win32::API->Import('kernel32', 'int GetConsoleCP()');
38b8851fccSafresh1		}
39b8851fccSafresh1		if (defined &GetConsoleCP) {
40b8851fccSafresh1		    my $cp = GetConsoleCP();
41b8851fccSafresh1		    $ENCODING_LOCALE = "cp$cp" if $cp;
42b8851fccSafresh1		}
43b8851fccSafresh1	    };
44b8851fccSafresh1	}
45b8851fccSafresh1
46b8851fccSafresh1	unless ($ENCODING_CONSOLE_IN) {
47b8851fccSafresh1            # only test one since set together
48b8851fccSafresh1            unless (defined &GetInputCP) {
49b8851fccSafresh1                eval {
50b8851fccSafresh1                    require Win32;
5156d68f1eSafresh1                    eval {
5256d68f1eSafresh1                        local $SIG{__WARN__} = sub {} if ( "$]" < 5.014 ); # suppress deprecation warning for inherited AUTOLOAD of Win32::GetConsoleCP()
5356d68f1eSafresh1                        Win32::GetConsoleCP();
5456d68f1eSafresh1                    };
55b8851fccSafresh1                    # manually "import" it since Win32->import refuses
565759b3d2Safresh1                    *GetInputCP = sub { &Win32::GetConsoleCP } if defined &Win32::GetConsoleCP;
575759b3d2Safresh1                    *GetOutputCP = sub { &Win32::GetConsoleOutputCP } if defined &Win32::GetConsoleOutputCP;
58b8851fccSafresh1                };
59b8851fccSafresh1                unless (defined &GetInputCP) {
60b8851fccSafresh1                    eval {
61b8851fccSafresh1                        # try Win32::Console module for codepage to use
62b8851fccSafresh1                        require Win32::Console;
63b8851fccSafresh1                        *GetInputCP = sub { &Win32::Console::InputCP }
645759b3d2Safresh1                            if defined &Win32::Console::InputCP;
65b8851fccSafresh1                        *GetOutputCP = sub { &Win32::Console::OutputCP }
665759b3d2Safresh1                            if defined &Win32::Console::OutputCP;
67b8851fccSafresh1                    };
68b8851fccSafresh1                }
69b8851fccSafresh1                unless (defined &GetInputCP) {
70b8851fccSafresh1                    # final fallback
71b8851fccSafresh1                    *GetInputCP = *GetOutputCP = sub {
72b8851fccSafresh1                        # another fallback that could work is:
73b8851fccSafresh1                        # reg query HKLM\System\CurrentControlSet\Control\Nls\CodePage /v ACP
74b8851fccSafresh1                        ((qx(chcp) || '') =~ /^Active code page: (\d+)/)
75b8851fccSafresh1                            ? $1 : ();
76b8851fccSafresh1                    };
77b8851fccSafresh1                }
78b8851fccSafresh1	    }
79b8851fccSafresh1            my $cp = GetInputCP();
80b8851fccSafresh1            $ENCODING_CONSOLE_IN = "cp$cp" if $cp;
81b8851fccSafresh1            $cp = GetOutputCP();
82b8851fccSafresh1            $ENCODING_CONSOLE_OUT = "cp$cp" if $cp;
83b8851fccSafresh1	}
84b8851fccSafresh1    }
85b8851fccSafresh1
86b8851fccSafresh1    unless ($ENCODING_LOCALE) {
87b8851fccSafresh1	eval {
88b8851fccSafresh1	    require I18N::Langinfo;
89b8851fccSafresh1	    $ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET());
90b8851fccSafresh1
91b8851fccSafresh1	    # Workaround of Encode < v2.25.  The "646" encoding  alias was
92b8851fccSafresh1	    # introduced in Encode-2.25, but we don't want to require that version
93b8851fccSafresh1	    # quite yet.  Should avoid the CPAN testers failure reported from
94b8851fccSafresh1	    # openbsd-4.7/perl-5.10.0 combo.
95b8851fccSafresh1	    $ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646";
96b8851fccSafresh1
97b8851fccSafresh1	    # https://rt.cpan.org/Ticket/Display.html?id=66373
98b8851fccSafresh1	    $ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8";
99b8851fccSafresh1	};
100b8851fccSafresh1	$ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN;
101b8851fccSafresh1    }
102b8851fccSafresh1
10356d68f1eSafresh1    # Workaround of Encode < v2.71 for "cp65000" and "cp65001"
10456d68f1eSafresh1    # The "cp65000" and "cp65001" aliases were added in [Encode v2.71](https://github.com/dankogai/p5-encode/commit/7874bd95aa10967a3b5dbae333d16bcd703ac6c6)
10556d68f1eSafresh1    # via commit <https://github.com/dankogai/p5-encode/commit/84b9c1101d5251d37e226f80d1c6781718779047>.
10656d68f1eSafresh1    # This will avoid test failures for Win32 machines using the UTF-7 or UTF-8 code pages.
10756d68f1eSafresh1    $ENCODING_LOCALE = 'UTF-7' if $ENCODING_LOCALE && lc($ENCODING_LOCALE) eq "cp65000";
10856d68f1eSafresh1    $ENCODING_LOCALE = 'utf-8-strict' if $ENCODING_LOCALE && lc($ENCODING_LOCALE) eq "cp65001";
10956d68f1eSafresh1
110b8851fccSafresh1    if ($^O eq "darwin") {
111b8851fccSafresh1	$ENCODING_LOCALE_FS ||= "UTF-8";
112b8851fccSafresh1    }
113b8851fccSafresh1
114b8851fccSafresh1    # final fallback
115b8851fccSafresh1    $ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8";
116b8851fccSafresh1    $ENCODING_LOCALE_FS ||= $ENCODING_LOCALE;
117b8851fccSafresh1    $ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE;
118b8851fccSafresh1    $ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN;
119b8851fccSafresh1
120b8851fccSafresh1    unless (Encode::find_encoding($ENCODING_LOCALE)) {
121b8851fccSafresh1	my $foundit;
122b8851fccSafresh1	if (lc($ENCODING_LOCALE) eq "gb18030") {
123b8851fccSafresh1	    eval {
124b8851fccSafresh1		require Encode::HanExtra;
125b8851fccSafresh1	    };
126b8851fccSafresh1	    if ($@) {
127b8851fccSafresh1		die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped";
128b8851fccSafresh1	    }
129b8851fccSafresh1	    $foundit++ if Encode::find_encoding($ENCODING_LOCALE);
130b8851fccSafresh1	}
131b8851fccSafresh1	die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped"
132b8851fccSafresh1	    unless $foundit;
133b8851fccSafresh1
134b8851fccSafresh1    }
135b8851fccSafresh1
136b8851fccSafresh1    # use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT;
137b8851fccSafresh1}
138b8851fccSafresh1
139b8851fccSafresh1_init();
140b8851fccSafresh1Encode::Alias::define_alias(sub {
141b8851fccSafresh1    no strict 'refs';
142b8851fccSafresh1    no warnings 'once';
143b8851fccSafresh1    return ${"ENCODING_" . uc(shift)};
144b8851fccSafresh1}, "locale");
145b8851fccSafresh1
146b8851fccSafresh1sub _flush_aliases {
147b8851fccSafresh1    no strict 'refs';
1485759b3d2Safresh1    for my $a (sort keys %Encode::Alias::Alias) {
149b8851fccSafresh1	if (defined ${"ENCODING_" . uc($a)}) {
150b8851fccSafresh1	    delete $Encode::Alias::Alias{$a};
151b8851fccSafresh1	    warn "Flushed alias cache for $a" if DEBUG;
152b8851fccSafresh1	}
153b8851fccSafresh1    }
154b8851fccSafresh1}
155b8851fccSafresh1
156b8851fccSafresh1sub reinit {
157b8851fccSafresh1    $ENCODING_LOCALE = shift;
158b8851fccSafresh1    $ENCODING_LOCALE_FS = shift;
159b8851fccSafresh1    $ENCODING_CONSOLE_IN = $ENCODING_LOCALE;
160b8851fccSafresh1    $ENCODING_CONSOLE_OUT = $ENCODING_LOCALE;
161b8851fccSafresh1    _init();
162b8851fccSafresh1    _flush_aliases();
163b8851fccSafresh1}
164b8851fccSafresh1
165b8851fccSafresh1sub decode_argv {
166b8851fccSafresh1    die if defined wantarray;
167b8851fccSafresh1    for (@ARGV) {
168b8851fccSafresh1	$_ = Encode::decode(locale => $_, @_);
169b8851fccSafresh1    }
170b8851fccSafresh1}
171b8851fccSafresh1
172b8851fccSafresh1sub env {
173b8851fccSafresh1    my $k = Encode::encode(locale => shift);
174b8851fccSafresh1    my $old = $ENV{$k};
175b8851fccSafresh1    if (@_) {
176b8851fccSafresh1	my $v = shift;
177b8851fccSafresh1	if (defined $v) {
178b8851fccSafresh1	    $ENV{$k} = Encode::encode(locale => $v);
179b8851fccSafresh1	}
180b8851fccSafresh1	else {
181b8851fccSafresh1	    delete $ENV{$k};
182b8851fccSafresh1	}
183b8851fccSafresh1    }
184b8851fccSafresh1    return Encode::decode(locale => $old) if defined wantarray;
185b8851fccSafresh1}
186b8851fccSafresh1
187b8851fccSafresh11;
188b8851fccSafresh1
189b8851fccSafresh1__END__
190b8851fccSafresh1
191b8851fccSafresh1=head1 NAME
192b8851fccSafresh1
193b8851fccSafresh1ExtUtils::MakeMaker::Locale - bundled Encode::Locale
194b8851fccSafresh1
195b8851fccSafresh1=head1 SYNOPSIS
196b8851fccSafresh1
197b8851fccSafresh1  use Encode::Locale;
198b8851fccSafresh1  use Encode;
199b8851fccSafresh1
200b8851fccSafresh1  $string = decode(locale => $bytes);
201b8851fccSafresh1  $bytes = encode(locale => $string);
202b8851fccSafresh1
203b8851fccSafresh1  if (-t) {
204b8851fccSafresh1      binmode(STDIN, ":encoding(console_in)");
205b8851fccSafresh1      binmode(STDOUT, ":encoding(console_out)");
206b8851fccSafresh1      binmode(STDERR, ":encoding(console_out)");
207b8851fccSafresh1  }
208b8851fccSafresh1
209b8851fccSafresh1  # Processing file names passed in as arguments
210b8851fccSafresh1  my $uni_filename = decode(locale => $ARGV[0]);
211b8851fccSafresh1  open(my $fh, "<", encode(locale_fs => $uni_filename))
212b8851fccSafresh1     || die "Can't open '$uni_filename': $!";
213b8851fccSafresh1  binmode($fh, ":encoding(locale)");
214b8851fccSafresh1  ...
215b8851fccSafresh1
216b8851fccSafresh1=head1 DESCRIPTION
217b8851fccSafresh1
218b8851fccSafresh1In many applications it's wise to let Perl use Unicode for the strings it
219b8851fccSafresh1processes.  Most of the interfaces Perl has to the outside world are still byte
220b8851fccSafresh1based.  Programs therefore need to decode byte strings that enter the program
221b8851fccSafresh1from the outside and encode them again on the way out.
222b8851fccSafresh1
223b8851fccSafresh1The POSIX locale system is used to specify both the language conventions
224b8851fccSafresh1requested by the user and the preferred character set to consume and
225b8851fccSafresh1output.  The C<Encode::Locale> module looks up the charset and encoding (called
226b8851fccSafresh1a CODESET in the locale jargon) and arranges for the L<Encode> module to know
227b8851fccSafresh1this encoding under the name "locale".  It means bytes obtained from the
228b8851fccSafresh1environment can be converted to Unicode strings by calling C<<
229b8851fccSafresh1Encode::encode(locale => $bytes) >> and converted back again with C<<
230b8851fccSafresh1Encode::decode(locale => $string) >>.
231b8851fccSafresh1
232b8851fccSafresh1Where file systems interfaces pass file names in and out of the program we also
233b8851fccSafresh1need care.  The trend is for operating systems to use a fixed file encoding
234b8851fccSafresh1that don't actually depend on the locale; and this module determines the most
235b8851fccSafresh1appropriate encoding for file names. The L<Encode> module will know this
236b8851fccSafresh1encoding under the name "locale_fs".  For traditional Unix systems this will
237b8851fccSafresh1be an alias to the same encoding as "locale".
238b8851fccSafresh1
239b8851fccSafresh1For programs running in a terminal window (called a "Console" on some systems)
240b8851fccSafresh1the "locale" encoding is usually a good choice for what to expect as input and
241b8851fccSafresh1output.  Some systems allows us to query the encoding set for the terminal and
242b8851fccSafresh1C<Encode::Locale> will do that if available and make these encodings known
243b8851fccSafresh1under the C<Encode> aliases "console_in" and "console_out".  For systems where
244b8851fccSafresh1we can't determine the terminal encoding these will be aliased as the same
245b8851fccSafresh1encoding as "locale".  The advice is to use "console_in" for input known to
246b8851fccSafresh1come from the terminal and "console_out" for output to the terminal.
247b8851fccSafresh1
248b8851fccSafresh1In addition to arranging for various Encode aliases the following functions and
249b8851fccSafresh1variables are provided:
250b8851fccSafresh1
251b8851fccSafresh1=over
252b8851fccSafresh1
253b8851fccSafresh1=item decode_argv( )
254b8851fccSafresh1
255b8851fccSafresh1=item decode_argv( Encode::FB_CROAK )
256b8851fccSafresh1
257b8851fccSafresh1This will decode the command line arguments to perl (the C<@ARGV> array) in-place.
258b8851fccSafresh1
259b8851fccSafresh1The function will by default replace characters that can't be decoded by
260b8851fccSafresh1"\x{FFFD}", the Unicode replacement character.
261b8851fccSafresh1
262b8851fccSafresh1Any argument provided is passed as CHECK to underlying Encode::decode() call.
263b8851fccSafresh1Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the
264b8851fccSafresh1command line arguments can be decoded.  See L<Encode/"Handling Malformed Data">
265b8851fccSafresh1for details on other options for CHECK.
266b8851fccSafresh1
267b8851fccSafresh1=item env( $uni_key )
268b8851fccSafresh1
269b8851fccSafresh1=item env( $uni_key => $uni_value )
270b8851fccSafresh1
271b8851fccSafresh1Interface to get/set environment variables.  Returns the current value as a
272b8851fccSafresh1Unicode string. The $uni_key and $uni_value arguments are expected to be
273b8851fccSafresh1Unicode strings as well.  Passing C<undef> as $uni_value deletes the
274b8851fccSafresh1environment variable named $uni_key.
275b8851fccSafresh1
276b8851fccSafresh1The returned value will have the characters that can't be decoded replaced by
277b8851fccSafresh1"\x{FFFD}", the Unicode replacement character.
278b8851fccSafresh1
279b8851fccSafresh1There is no interface to request alternative CHECK behavior as for
280b8851fccSafresh1decode_argv().  If you need that you need to call encode/decode yourself.
281b8851fccSafresh1For example:
282b8851fccSafresh1
283b8851fccSafresh1    my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK);
284b8851fccSafresh1    my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK);
285b8851fccSafresh1
286b8851fccSafresh1=item reinit( )
287b8851fccSafresh1
288b8851fccSafresh1=item reinit( $encoding )
289b8851fccSafresh1
290b8851fccSafresh1Reinitialize the encodings from the locale.  You want to call this function if
291b8851fccSafresh1you changed anything in the environment that might influence the locale.
292b8851fccSafresh1
293b8851fccSafresh1This function will croak if the determined encoding isn't recognized by
294b8851fccSafresh1the Encode module.
295b8851fccSafresh1
296b8851fccSafresh1With argument force $ENCODING_... variables to set to the given value.
297b8851fccSafresh1
298b8851fccSafresh1=item $ENCODING_LOCALE
299b8851fccSafresh1
300b8851fccSafresh1The encoding name determined to be suitable for the current locale.
301b8851fccSafresh1L<Encode> know this encoding as "locale".
302b8851fccSafresh1
303b8851fccSafresh1=item $ENCODING_LOCALE_FS
304b8851fccSafresh1
305b8851fccSafresh1The encoding name determined to be suitable for file system interfaces
306b8851fccSafresh1involving file names.
307b8851fccSafresh1L<Encode> know this encoding as "locale_fs".
308b8851fccSafresh1
309b8851fccSafresh1=item $ENCODING_CONSOLE_IN
310b8851fccSafresh1
311b8851fccSafresh1=item $ENCODING_CONSOLE_OUT
312b8851fccSafresh1
313b8851fccSafresh1The encodings to be used for reading and writing output to the a console.
314b8851fccSafresh1L<Encode> know these encodings as "console_in" and "console_out".
315b8851fccSafresh1
316b8851fccSafresh1=back
317b8851fccSafresh1
318b8851fccSafresh1=head1 NOTES
319b8851fccSafresh1
320b8851fccSafresh1This table summarizes the mapping of the encodings set up
321b8851fccSafresh1by the C<Encode::Locale> module:
322b8851fccSafresh1
323b8851fccSafresh1  Encode      |         |              |
324b8851fccSafresh1  Alias       | Windows | Mac OS X     | POSIX
325b8851fccSafresh1  ------------+---------+--------------+------------
326b8851fccSafresh1  locale      | ANSI    | nl_langinfo  | nl_langinfo
327b8851fccSafresh1  locale_fs   | ANSI    | UTF-8        | nl_langinfo
328b8851fccSafresh1  console_in  | OEM     | nl_langinfo  | nl_langinfo
329b8851fccSafresh1  console_out | OEM     | nl_langinfo  | nl_langinfo
330b8851fccSafresh1
331b8851fccSafresh1=head2 Windows
332b8851fccSafresh1
333b8851fccSafresh1Windows has basically 2 sets of APIs.  A wide API (based on passing UTF-16
334b8851fccSafresh1strings) and a byte based API based a character set called ANSI.  The
335b8851fccSafresh1regular Perl interfaces to the OS currently only uses the ANSI APIs.
336b8851fccSafresh1Unfortunately ANSI is not a single character set.
337b8851fccSafresh1
338b8851fccSafresh1The encoding that corresponds to ANSI varies between different editions of
339b8851fccSafresh1Windows.  For many western editions of Windows ANSI corresponds to CP-1252
340b8851fccSafresh1which is a character set similar to ISO-8859-1.  Conceptually the ANSI
341b8851fccSafresh1character set is a similar concept to the POSIX locale CODESET so this module
342b8851fccSafresh1figures out what the ANSI code page is and make this available as
343b8851fccSafresh1$ENCODING_LOCALE and the "locale" Encoding alias.
344b8851fccSafresh1
345b8851fccSafresh1Windows systems also operate with another byte based character set.
346b8851fccSafresh1It's called the OEM code page.  This is the encoding that the Console
347b8851fccSafresh1takes as input and output.  It's common for the OEM code page to
348b8851fccSafresh1differ from the ANSI code page.
349b8851fccSafresh1
350b8851fccSafresh1=head2 Mac OS X
351b8851fccSafresh1
352b8851fccSafresh1On Mac OS X the file system encoding is always UTF-8 while the locale
353b8851fccSafresh1can otherwise be set up as normal for POSIX systems.
354b8851fccSafresh1
355b8851fccSafresh1File names on Mac OS X will at the OS-level be converted to
356b8851fccSafresh1NFD-form.  A file created by passing a NFC-filename will come
357b8851fccSafresh1in NFD-form from readdir().  See L<Unicode::Normalize> for details
358b8851fccSafresh1of NFD/NFC.
359b8851fccSafresh1
360b8851fccSafresh1Actually, Apple does not follow the Unicode NFD standard since not all
361b8851fccSafresh1character ranges are decomposed.  The claim is that this avoids problems with
362b8851fccSafresh1round trip conversions from old Mac text encodings.  See L<Encode::UTF8Mac> for
363b8851fccSafresh1details.
364b8851fccSafresh1
365b8851fccSafresh1=head2 POSIX (Linux and other Unixes)
366b8851fccSafresh1
367b8851fccSafresh1File systems might vary in what encoding is to be used for
368b8851fccSafresh1filenames.  Since this module has no way to actually figure out
369b8851fccSafresh1what the is correct it goes with the best guess which is to
370b8851fccSafresh1assume filenames are encoding according to the current locale.
371b8851fccSafresh1Users are advised to always specify UTF-8 as the locale charset.
372b8851fccSafresh1
373b8851fccSafresh1=head1 SEE ALSO
374b8851fccSafresh1
375b8851fccSafresh1L<I18N::Langinfo>, L<Encode>, L<Term::Encoding>
376b8851fccSafresh1
377b8851fccSafresh1=head1 AUTHOR
378b8851fccSafresh1
379b8851fccSafresh1Copyright 2010 Gisle Aas <gisle@aas.no>.
380b8851fccSafresh1
381b8851fccSafresh1This library is free software; you can redistribute it and/or
382b8851fccSafresh1modify it under the same terms as Perl itself.
383b8851fccSafresh1
384b8851fccSafresh1=cut
385