xref: /openbsd-src/gnu/usr.bin/perl/cpan/Pod-Simple/t/encod04.t (revision 3d61058aa5c692477b6d18acfbbdb653a9930ff9)
1# The encoding detection heuristic will choose UTF8 or CP1252.  The current
2# implementation will usually treat CP1252 (aka "Win-Latin-1") as CP1252 but
3# can be fooled into seeing it as UTF8.
4
5use strict;
6use warnings;
7use Test::More tests => 5;
8
# Failures below are reported with Test::More's fail() and a diagnostic message
10
11use Pod::Simple::DumpAsXML;
12use Pod::Simple::XMLOutStream;
13
14
15# Initial, isolated, non-ASCII byte triggers CP1252 guess and later
16# multi-byte sequence is not considered by heuristic.
17
# Characters interpolated into the sample POD below, plus the text the first
# test looks for after "Dash" in the formatter output.  From 5.7.3 on the two
# code points go through utf8::unicode_to_native(); older perls use the raw
# values and a plain '--'.
my ($x97, $x91, $dash) = $] ge 5.007_003
    ? ( chr(utf8::unicode_to_native(0x97)),
        chr(utf8::unicode_to_native(0x91)),
        '&#8212' )
    : ( chr(0x97), chr(0x91), '--' );   # Tests will fail for early EBCDICs
31
# Format the sample POD as XML, then break the result into lines.
my $xml = Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Em::Dash $x97 ${x91}CAF\xC9\x92

=cut

} );
my @output_lines = split m/[\r\n]+/, $xml;

# The encoding name is taken from the "Non-ASCII ... Assuming <name>" text
# found in the output.
my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

# One outcome per branch: nothing detected, wrong guess, dash found, dash
# missing.
if( !$guess ) {
  fail "parser failed to detect non-ASCII bytes in input";
}
elsif( $guess ne 'CP1252' ) {
  fail "parser guessed wrong encoding expected 'CP1252' got '$guess'";
}
elsif( grep { m{Dash $dash} } @output_lines ) {
  ok 1;
}
else {
  fail "failed to find expected control character in output";
}
56
57
# Initial smart-quote character triggers CP1252 guess as expected.
#
# The expectation only holds on ASCII platforms; elsewhere the test is
# reported as skipped rather than being counted as a pass.

SKIP: {
    skip 'ASCII-platform dependent test', 1 if ord("A") != 65;

    @output_lines = split m/[\r\n]+/, Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Smart::Quote - ${x91}FUT\xC9\x92

=cut

} );

    # The encoding name is captured from the "Non-ASCII ... Assuming <name>"
    # text; $guess stays undefined when that text is absent.
    ($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

    if ( !defined $guess ) {
        fail "parser failed to detect non-ASCII bytes in input";
    }
    else {
        # is() reports both the expected and the actual guess on failure.
        is $guess, 'CP1252', 'leading smart quote is guessed as CP1252';
    }
}
85
86
# Initial accented character (E acute) followed by 'smart' apostrophe is legal
# CP1252, which should be preferred over UTF-8 because the latter
# interpretation would be "JOS" . \N{LATIN SMALL LETTER TURNED ALPHA} . "S
# PLACE", and that \N{} letter is an IPA one.
#
# The expectation only holds on ASCII platforms; elsewhere the test is
# reported as skipped rather than being counted as a pass.

SKIP: {
    skip 'ASCII-platform dependent test', 1 if ord("A") != 65;

    @output_lines = split m/[\r\n]+/, Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

=head2 JOS\xC9\x92S PLACE

=cut

} );

    # The encoding name is captured from the "Non-ASCII ... Assuming <name>"
    # text; $guess stays undefined when that text is absent.
    ($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

    if ( !defined $guess ) {
        fail "parser failed to detect non-ASCII bytes in input";
    }
    else {
        # is() reports both the expected and the actual guess on failure.
        is $guess, 'CP1252', 'E acute plus smart apostrophe is guessed as CP1252';
    }
}
117
118
# The previous example used a CP1252 byte sequence that also happened to be a
# valid UTF8 byte sequence.  In this example we use an illegal UTF-8 sequence
# (it needs a third byte), so must be 1252
#
# The expectation only holds on ASCII platforms; elsewhere the test is
# reported as skipped rather than being counted as a pass.

SKIP: {
    skip 'ASCII-platform dependent test', 1 if ord("A") != 65;

    @output_lines = split m/[\r\n]+/, Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Smart::Apostrophe::Fail - L\xE9\x92Strange

=cut

} );

    # The encoding name is captured from the "Non-ASCII ... Assuming <name>"
    # text; $guess stays undefined when that text is absent.
    ($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

    if ( !defined $guess ) {
        fail "parser failed to detect non-ASCII bytes in input";
    }
    else {
        # is() reports both the expected and the actual guess on failure.
        is $guess, 'CP1252', 'illegal UTF-8 sequence is guessed as CP1252';
    }
}
148
# The following is a real-world example of something in CP1252 expressible in
# UTF-8, but doesn't make sense in UTF-8, contributed by Bo Lindbergh.
# Muvrarášša is a Sami word
#
# The expectation only holds on ASCII platforms; elsewhere the test is
# reported as skipped rather than being counted as a pass.

SKIP: {
    skip 'ASCII-platform dependent test', 1 if ord("A") != 65;

    @output_lines = split m/[\r\n]+/, Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Muvrar\xE1\x9A\x9Aa is a mountain in Norway

=cut

} );

    # The encoding name is captured from the "Non-ASCII ... Assuming <name>"
    # text; $guess stays undefined when that text is absent.
    ($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

    if ( !defined $guess ) {
        fail "parser failed to detect non-ASCII bytes in input";
    }
    else {
        # is() reports both the expected and the actual guess on failure.
        is $guess, 'CP1252', 'Sami word in CP1252 is guessed as CP1252';
    }
}
178