xref: /openbsd-src/gnu/usr.bin/perl/dist/encoding-warnings/lib/encoding/warnings.pm (revision e068048151d29f2562a32185e21a8ba885482260)
1b8851fccSafresh1package encoding::warnings;
2*e0680481Safresh1$encoding::warnings::VERSION = '0.14';
3b8851fccSafresh1
4b8851fccSafresh1use strict;
5b8851fccSafresh1use 5.007;
6b8851fccSafresh1
7b8851fccSafresh1=head1 NAME
8b8851fccSafresh1
9b8851fccSafresh1encoding::warnings - Warn on implicit encoding conversions
10b8851fccSafresh1
11b8851fccSafresh1=head1 VERSION
12b8851fccSafresh1
135759b3d2Safresh1This document describes version 0.13 of encoding::warnings, released
145759b3d2Safresh1June 20, 2016.
155759b3d2Safresh1
165759b3d2Safresh1=head1 NOTICE
175759b3d2Safresh1
185759b3d2Safresh1As of Perl 5.26.0, this module has no effect.  The internal Perl feature
195759b3d2Safresh1that was used to implement this module has been removed.  In recent years,
205759b3d2Safresh1much work has been done on the Perl core to eliminate discrepancies in the
215759b3d2Safresh1treatment of upgraded versus downgraded strings.  In addition, the
225759b3d2Safresh1L<encoding> pragma, which caused many of the problems, is no longer
235759b3d2Safresh1supported.  Thus, the warnings this module produced are no longer
245759b3d2Safresh1necessary.
255759b3d2Safresh1
265759b3d2Safresh1Hence, if you load this module on Perl 5.26.0, you will get one warning
275759b3d2Safresh1that the module is no longer supported; and the module will do nothing
285759b3d2Safresh1thereafter.
29b8851fccSafresh1
30b8851fccSafresh1=head1 SYNOPSIS
31b8851fccSafresh1
32b8851fccSafresh1    use encoding::warnings; # or 'FATAL' to raise fatal exceptions
33b8851fccSafresh1
34b8851fccSafresh1    utf8::encode($a = chr(20000));  # a byte-string (raw bytes)
35b8851fccSafresh1    $b = chr(20000);                # a unicode-string (wide characters)
36b8851fccSafresh1
37b8851fccSafresh1    # "Bytes implicitly upgraded into wide characters as iso-8859-1"
38b8851fccSafresh1    $c = $a . $b;
39b8851fccSafresh1
40b8851fccSafresh1=head1 DESCRIPTION
41b8851fccSafresh1
42b8851fccSafresh1=head2 Overview of the problem
43b8851fccSafresh1
44b8851fccSafresh1By default, there is a fundamental asymmetry in Perl's unicode model:
45b8851fccSafresh1implicit upgrading from byte-strings to unicode-strings assumes that
46b8851fccSafresh1they were encoded in I<ISO 8859-1 (Latin-1)>, but unicode-strings are
downgraded with UTF-8 encoding.  This happens because the first 256
codepoints in Unicode happen to agree with Latin-1.
49b8851fccSafresh1
50b8851fccSafresh1However, this silent upgrading can easily cause problems, if you happen
51b8851fccSafresh1to mix unicode strings with non-Latin1 data -- i.e. byte-strings encoded
in UTF-8 or other encodings.  The error will not manifest until the
combined string is written to output, at which time it would be impossible
to see where the silent upgrading occurred.
55b8851fccSafresh1
56b8851fccSafresh1=head2 Detecting the problem
57b8851fccSafresh1
58b8851fccSafresh1This module simplifies the process of diagnosing such problems.  Just put
59b8851fccSafresh1this line on top of your main program:
60b8851fccSafresh1
61b8851fccSafresh1    use encoding::warnings;
62b8851fccSafresh1
63b8851fccSafresh1Afterwards, implicit upgrading of high-bit bytes will raise a warning.
64b8851fccSafresh1Ex.: C<Bytes implicitly upgraded into wide characters as iso-8859-1 at
65b8851fccSafresh1- line 7>.
66b8851fccSafresh1
67b8851fccSafresh1However, strings composed purely of ASCII code points (C<0x00>..C<0x7F>)
68b8851fccSafresh1will I<not> trigger this warning.
69b8851fccSafresh1
70b8851fccSafresh1You can also make the warnings fatal by importing this module as:
71b8851fccSafresh1
72b8851fccSafresh1    use encoding::warnings 'FATAL';
73b8851fccSafresh1
74b8851fccSafresh1=head2 Solving the problem
75b8851fccSafresh1
76b8851fccSafresh1Most of the time, this warning occurs when a byte-string is concatenated
77b8851fccSafresh1with a unicode-string.  There are a number of ways to solve it:
78b8851fccSafresh1
79b8851fccSafresh1=over 4
80b8851fccSafresh1
81b8851fccSafresh1=item * Upgrade both sides to unicode-strings
82b8851fccSafresh1
83b8851fccSafresh1If your program does not need compatibility for Perl 5.6 and earlier,
84b8851fccSafresh1the recommended approach is to apply appropriate IO disciplines, so all
85b8851fccSafresh1data in your program become unicode-strings.  See L<encoding>, L<open> and
86b8851fccSafresh1L<perlfunc/binmode> for how.
87b8851fccSafresh1
88b8851fccSafresh1=item * Downgrade both sides to byte-strings
89b8851fccSafresh1
90b8851fccSafresh1The other way works too, especially if you are sure that all your data
91b8851fccSafresh1are under the same encoding, or if compatibility with older versions
92b8851fccSafresh1of Perl is desired.
93b8851fccSafresh1
94b8851fccSafresh1You may downgrade strings with C<Encode::encode> and C<utf8::encode>.
95b8851fccSafresh1See L<Encode> and L<utf8> for details.
96b8851fccSafresh1
97b8851fccSafresh1=item * Specify the encoding for implicit byte-string upgrading
98b8851fccSafresh1
99b8851fccSafresh1If you are confident that all byte-strings will be in a specific
100b8851fccSafresh1encoding like UTF-8, I<and> need not support older versions of Perl,
101b8851fccSafresh1use the C<encoding> pragma:
102b8851fccSafresh1
103b8851fccSafresh1    use encoding 'utf8';
104b8851fccSafresh1
105b8851fccSafresh1Similarly, this will silence warnings from this module, and preserve the
106b8851fccSafresh1default behaviour:
107b8851fccSafresh1
108b8851fccSafresh1    use encoding 'iso-8859-1';
109b8851fccSafresh1
110b8851fccSafresh1However, note that C<use encoding> actually had three distinct effects:
111b8851fccSafresh1
112b8851fccSafresh1=over 4
113b8851fccSafresh1
114b8851fccSafresh1=item * PerlIO layers for B<STDIN> and B<STDOUT>
115b8851fccSafresh1
116b8851fccSafresh1This is similar to what L<open> pragma does.
117b8851fccSafresh1
118b8851fccSafresh1=item * Literal conversions
119b8851fccSafresh1
This turns I<all> literal strings in your program into unicode-strings
(equivalent to a C<use utf8>), by decoding them using the specified
encoding.
123b8851fccSafresh1
124b8851fccSafresh1=item * Implicit upgrading for byte-strings
125b8851fccSafresh1
126b8851fccSafresh1This will silence warnings from this module, as shown above.
127b8851fccSafresh1
128b8851fccSafresh1=back
129b8851fccSafresh1
130b8851fccSafresh1Because literal conversions also work on empty strings, it may surprise
131b8851fccSafresh1some people:
132b8851fccSafresh1
133b8851fccSafresh1    use encoding 'big5';
134b8851fccSafresh1
    my $byte_string = pack("C*", 0xA4, 0x40);
    print length $byte_string;    # 2 here.
    $byte_string .= "";           # concatenating with a unicode string...
    print length $byte_string;    # 1 here!
139b8851fccSafresh1
140b8851fccSafresh1In other words, do not C<use encoding> unless you are certain that the
141b8851fccSafresh1program will not deal with any raw, 8-bit binary data at all.
142b8851fccSafresh1
143b8851fccSafresh1However, the C<Filter =E<gt> 1> flavor of C<use encoding> will I<not>
144b8851fccSafresh1affect implicit upgrading for byte-strings, and is thus incapable of
145b8851fccSafresh1silencing warnings from this module.  See L<encoding> for more details.
146b8851fccSafresh1
147b8851fccSafresh1=back
148b8851fccSafresh1
149b8851fccSafresh1=head1 CAVEATS
150b8851fccSafresh1
151b8851fccSafresh1For Perl 5.9.4 or later, this module's effect is lexical.
152b8851fccSafresh1
153b8851fccSafresh1For Perl versions prior to 5.9.4, this module affects the whole script,
154b8851fccSafresh1instead of inside its lexical block.
155b8851fccSafresh1
156b8851fccSafresh1=cut
157b8851fccSafresh1
# Slot indices into the array reference that import() blesses as the
# decoder object:
#   ASCII  - slot holding the 'us-ascii' Encode object
#   LATIN1 - slot holding the 'iso-8859-1' Encode object
#   FATAL  - slot holding the name of the Carp routine used to report
use constant ASCII  => 0;
use constant LATIN1 => 1;
use constant FATAL  => 2;
162b8851fccSafresh1
# import($class, [$fatal])
#
# Called by "use encoding::warnings". When the optional argument is the
# string 'FATAL', the installed decoder reports through Carp::croak;
# otherwise it reports through Carp::carp.
#
# On Perl 5.25.3 and later this only emits a Carp::cluck notice and
# returns without installing anything.
sub import {
    my ($class, $fatal) = @_;
    $fatal = '' unless $fatal;

    unless ($] < 5.025003) {
        require Carp;
        Carp::cluck(
            "encoding::warnings is not supported on Perl 5.26.0 and later"
        );
        return;
    }

    # Keep the caller's $@ intact across the eval below.
    local $@;

    # Leave a ${^ENCODING} handler of some other class alone.
    if (${^ENCODING}) {
        return if ref(${^ENCODING}) ne $class;
    }

    # Without Encode (or either of the two encodings) there is nothing
    # to install, so give up silently.
    eval { require Encode; 1 } or return;

    my $ascii = Encode::find_encoding('us-ascii');
    return unless $ascii;

    my $latin1 = Encode::find_encoding('iso-8859-1');
    return unless $latin1;

    # Have to undef explicitly here
    undef ${^ENCODING};

    # The reporter is stored by name and resolved when decode() calls it.
    my $reporter;
    if ($fatal eq 'FATAL') {
        $reporter = 'Carp::croak';
    }
    else {
        $reporter = 'Carp::carp';
    }

    # Slot order: ascii encoding, latin-1 encoding, reporter name.
    my $decoder = bless([ $ascii, $latin1, $reporter ], $class);

    no warnings 'deprecated';
    ${^ENCODING} = $decoder;
    use warnings 'deprecated';

    # Record the pragma in the hints hash; decode() looks it up there.
    $^H{$class} = 1;
}
200b8851fccSafresh1
# unimport($class)
#
# Called by "no encoding::warnings". Sets this class's entry in the
# hints hash to undef and clears ${^ENCODING}.
sub unimport {
    my ($class) = @_;

    $^H{$class} = undef;
    undef ${^ENCODING};
}
206b8851fccSafresh1
# cat_decode($self, ...)
#
# Source code literals are not checked: the call is handed straight to
# the iso-8859-1 encoding object held in the LATIN1 slot.
sub cat_decode {
    my $latin1 = shift->[LATIN1];

    # Pass @_ itself rather than a copy, so the callee receives the
    # caller's own argument variables.
    return $latin1->cat_decode(@_);
}
212b8851fccSafresh1
# decode($self, $octets, ...)
#
# Reports (via the routine named in the FATAL slot) when the data is not
# purely US-ASCII, then decodes it as iso-8859-1. Data that decodes
# cleanly as US-ASCII is returned directly from the ASCII decoder.
sub decode {
    my $self = shift;

    # From Perl 5.9.4 on, only check when the calling scope's hints hash
    # has an entry for this object's class; older perls always check.
    my $check = 1;
    if ($] >= 5.009004) {
        my $hints = (caller(0))[10];
        $check = 0 unless $hints->{ref($self)};
    }

    if ($check) {
        local $@;

        # Strict US-ASCII decode: FB_CROAK makes it die on anything else.
        my $decoded = eval {
            $self->[ASCII]->decode($_[0], Encode::FB_CROAK())
        };
        return $decoded unless $@;

        require Carp;

        # The reporter is stored as a sub name, hence the symbolic call.
        no strict 'refs';
        $self->[FATAL]->(
            "Bytes implicitly upgraded into wide characters as iso-8859-1"
        );
    }

    return $self->[LATIN1]->decode(@_);
}
237b8851fccSafresh1
# name()
#
# Returns the encoding name this object reports: always 'iso-8859-1'.
sub name {
    return 'iso-8859-1';
}
239b8851fccSafresh1
240b8851fccSafresh11;
241b8851fccSafresh1
242b8851fccSafresh1__END__
243b8851fccSafresh1
244b8851fccSafresh1=head1 SEE ALSO
245b8851fccSafresh1
246b8851fccSafresh1L<perlunicode>, L<perluniintro>
247b8851fccSafresh1
248b8851fccSafresh1L<open>, L<utf8>, L<encoding>, L<Encode>
249b8851fccSafresh1
250b8851fccSafresh1=head1 AUTHORS
251b8851fccSafresh1
252b8851fccSafresh1Audrey Tang
253b8851fccSafresh1
254b8851fccSafresh1=head1 COPYRIGHT
255b8851fccSafresh1
256b8851fccSafresh1Copyright 2004, 2005, 2006, 2007 by Audrey Tang E<lt>cpan@audreyt.orgE<gt>.
257b8851fccSafresh1
258b8851fccSafresh1This program is free software; you can redistribute it and/or modify it
259b8851fccSafresh1under the same terms as Perl itself.
260b8851fccSafresh1
261b8851fccSafresh1See L<http://www.perl.com/perl/misc/Artistic.html>
262b8851fccSafresh1
263b8851fccSafresh1=cut
264