xref: /openbsd-src/gnu/usr.bin/perl/dist/encoding-warnings/lib/encoding/warnings.pm (revision e068048151d29f2562a32185e21a8ba885482260)
1package encoding::warnings;
2$encoding::warnings::VERSION = '0.14';
3
4use strict;
5use 5.007;
6
7=head1 NAME
8
9encoding::warnings - Warn on implicit encoding conversions
10
11=head1 VERSION
12
13This document describes version 0.13 of encoding::warnings, released
14June 20, 2016.
15
16=head1 NOTICE
17
18As of Perl 5.26.0, this module has no effect.  The internal Perl feature
19that was used to implement this module has been removed.  In recent years,
20much work has been done on the Perl core to eliminate discrepancies in the
21treatment of upgraded versus downgraded strings.  In addition, the
22L<encoding> pragma, which caused many of the problems, is no longer
23supported.  Thus, the warnings this module produced are no longer
24necessary.
25
Hence, if you load this module on Perl 5.26.0 or later, you will get one
warning that the module is no longer supported; and the module will do
nothing thereafter.
29
30=head1 SYNOPSIS
31
32    use encoding::warnings; # or 'FATAL' to raise fatal exceptions
33
34    utf8::encode($a = chr(20000));  # a byte-string (raw bytes)
35    $b = chr(20000);                # a unicode-string (wide characters)
36
37    # "Bytes implicitly upgraded into wide characters as iso-8859-1"
38    $c = $a . $b;
39
40=head1 DESCRIPTION
41
42=head2 Overview of the problem
43
44By default, there is a fundamental asymmetry in Perl's unicode model:
45implicit upgrading from byte-strings to unicode-strings assumes that
46they were encoded in I<ISO 8859-1 (Latin-1)>, but unicode-strings are
downgraded with UTF-8 encoding.  This happens because the first 256
codepoints in Unicode happen to agree with Latin-1.
49
50However, this silent upgrading can easily cause problems, if you happen
51to mix unicode strings with non-Latin1 data -- i.e. byte-strings encoded
52in UTF-8 or other encodings.  The error will not manifest until the
combined string is written to output, at which time it would be impossible
to see where the silent upgrading occurred.
55
56=head2 Detecting the problem
57
58This module simplifies the process of diagnosing such problems.  Just put
59this line on top of your main program:
60
61    use encoding::warnings;
62
63Afterwards, implicit upgrading of high-bit bytes will raise a warning.
64Ex.: C<Bytes implicitly upgraded into wide characters as iso-8859-1 at
65- line 7>.
66
67However, strings composed purely of ASCII code points (C<0x00>..C<0x7F>)
68will I<not> trigger this warning.
69
70You can also make the warnings fatal by importing this module as:
71
72    use encoding::warnings 'FATAL';
73
74=head2 Solving the problem
75
76Most of the time, this warning occurs when a byte-string is concatenated
77with a unicode-string.  There are a number of ways to solve it:
78
79=over 4
80
81=item * Upgrade both sides to unicode-strings
82
83If your program does not need compatibility for Perl 5.6 and earlier,
84the recommended approach is to apply appropriate IO disciplines, so all
85data in your program become unicode-strings.  See L<encoding>, L<open> and
86L<perlfunc/binmode> for how.
87
88=item * Downgrade both sides to byte-strings
89
90The other way works too, especially if you are sure that all your data
91are under the same encoding, or if compatibility with older versions
92of Perl is desired.
93
94You may downgrade strings with C<Encode::encode> and C<utf8::encode>.
95See L<Encode> and L<utf8> for details.
96
97=item * Specify the encoding for implicit byte-string upgrading
98
99If you are confident that all byte-strings will be in a specific
100encoding like UTF-8, I<and> need not support older versions of Perl,
101use the C<encoding> pragma:
102
103    use encoding 'utf8';
104
105Similarly, this will silence warnings from this module, and preserve the
106default behaviour:
107
108    use encoding 'iso-8859-1';
109
110However, note that C<use encoding> actually had three distinct effects:
111
112=over 4
113
114=item * PerlIO layers for B<STDIN> and B<STDOUT>
115
116This is similar to what L<open> pragma does.
117
118=item * Literal conversions
119
This turns I<all> literal strings in your program into unicode-strings
121(equivalent to a C<use utf8>), by decoding them using the specified
122encoding.
123
124=item * Implicit upgrading for byte-strings
125
126This will silence warnings from this module, as shown above.
127
128=back
129
130Because literal conversions also work on empty strings, it may surprise
131some people:
132
133    use encoding 'big5';
134
    my $byte_string = pack("C*", 0xA4, 0x40);
    print length $byte_string;    # 2 here.
    $byte_string .= "";           # concatenating with a unicode string...
    print length $byte_string;    # 1 here!
139
140In other words, do not C<use encoding> unless you are certain that the
141program will not deal with any raw, 8-bit binary data at all.
142
143However, the C<Filter =E<gt> 1> flavor of C<use encoding> will I<not>
144affect implicit upgrading for byte-strings, and is thus incapable of
145silencing warnings from this module.  See L<encoding> for more details.
146
147=back
148
149=head1 CAVEATS
150
151For Perl 5.9.4 or later, this module's effect is lexical.
152
153For Perl versions prior to 5.9.4, this module affects the whole script,
154instead of inside its lexical block.
155
156=cut
157
# Slot indices into the array-based handler object that import() blesses.
# Kept as empty-prototype subs so they can be used as bare constants.
sub ASCII ()  { 0 }    # slot holding the 'us-ascii' encoding object
sub LATIN1 () { 1 }    # slot holding the 'iso-8859-1' encoding object
sub FATAL ()  { 2 }    # slot holding the name of the Carp reporting function
162
# Pragma entry point for C<use encoding::warnings> and
# C<use encoding::warnings 'FATAL'>.
#
# Arguments: the class name, then an optional mode string; only the exact
# string 'FATAL' selects Carp::croak as the reporter, anything else selects
# Carp::carp.
#
# On Perl 5.25.3 and newer this emits a Carp::cluck notice and returns
# without doing anything else.  On older perls it installs a handler object
# of this class in ${^ENCODING} and marks the pragma as active in %^H.  It
# silently does nothing when a handler of a different class is already in
# place, when Encode cannot be loaded, or when either required encoding
# cannot be found.
sub import {
    my ($class, $mode) = @_;

    if ($] >= 5.025003) {
        require Carp;
        Carp::cluck(
            "encoding::warnings is not supported on Perl 5.26.0 and later"
        );
        return;
    }

    # Keep the eval below from disturbing the caller's $@.
    local $@;

    # Leave a handler that belongs to some other class alone.
    if (${^ENCODING} and ref(${^ENCODING}) ne $class) {
        return;
    }

    eval { require Encode; 1 } or return;

    my $ascii = Encode::find_encoding('us-ascii');
    return unless $ascii;

    my $latin1 = Encode::find_encoding('iso-8859-1');
    return unless $latin1;

    # Have to undef explicitly here
    undef ${^ENCODING};

    # The reporter is stored by name and resolved when decode() calls it.
    my $reporter = (defined $mode and $mode eq 'FATAL')
        ? 'Carp::croak'
        : 'Carp::carp';

    # Element order must match the ASCII, LATIN1 and FATAL slot constants.
    my $decoder = bless [ $ascii, $latin1, $reporter ], $class;

    no warnings 'deprecated';
    ${^ENCODING} = $decoder;
    use warnings 'deprecated';
    $^H{$class} = 1;
}
200
# Pragma exit point for C<no encoding::warnings>.
#
# Clears this class's entry in %^H and removes the ${^ENCODING} handler.
# The handler is only removed when the slot is empty or holds an object of
# this class: import() declines to replace a handler owned by another
# class, so unimport() must not destroy one either.
sub unimport {
    my $class = shift;
    $^H{$class} = undef;

    # Same ownership test as import(): a foreign handler is left in place.
    undef ${^ENCODING}
        unless ${^ENCODING} and ref(${^ENCODING}) ne $class;

    return;
}
206
# Never warns: the call is delegated unchanged to the iso-8859-1 encoding
# object held in the handler (the original note here reads "don't worry
# about source code literals").
sub cat_decode {
    my $latin1 = shift->[LATIN1];

    # Hand over @_ itself rather than a copy, so the delegate receives the
    # caller's own argument aliases.
    return $latin1->cat_decode(@_);
}
212
# Decode method of the handler object that import() stores in ${^ENCODING}.
#
# Where the pragma is active, the first argument is decoded strictly as
# us-ascii.  If that succeeds, the result is returned as is.  If it fails,
# the reporter named in the FATAL slot is called with the "implicitly
# upgraded" message; unless that call dies, control falls through to the
# iso-8859-1 decode below.  Where the pragma is not active, only the
# iso-8859-1 decode runs.
sub decode {
    my $self = shift;

    # On Perl 5.9.4 and later, consult the caller's hint hash so the
    # warning is limited to code where this class's %^H entry is true.
    my $enabled = 1;
    if ($] >= 5.009004) {
        my $hint_hash = (caller(0))[10];
        $enabled = 0 unless $hint_hash->{ ref $self };
    }

    if ($enabled) {
        local $@;
        my $decoded = eval {
            $self->[ASCII]->decode($_[0], Encode::FB_CROAK());
        };
        return $decoded unless $@;

        require Carp;

        # The reporter is stored as a function name, hence the symbolic call.
        no strict 'refs';
        my $report = $self->[FATAL];
        $report->(
            "Bytes implicitly upgraded into wide characters as iso-8859-1"
        );
    }

    return $self->[LATIN1]->decode(@_);
}
237
# Encoding name reported by the handler object; always 'iso-8859-1'.
sub name {
    return 'iso-8859-1';
}
239
2401;
241
242__END__
243
244=head1 SEE ALSO
245
246L<perlunicode>, L<perluniintro>
247
248L<open>, L<utf8>, L<encoding>, L<Encode>
249
250=head1 AUTHORS
251
252Audrey Tang
253
254=head1 COPYRIGHT
255
256Copyright 2004, 2005, 2006, 2007 by Audrey Tang E<lt>cpan@audreyt.orgE<gt>.
257
258This program is free software; you can redistribute it and/or modify it
259under the same terms as Perl itself.
260
261See L<http://www.perl.com/perl/misc/Artistic.html>
262
263=cut
264