# The encoding detection heuristic will choose UTF8 or CP1252.  The current
# implementation will usually treat CP1252 (aka "Win-Latin-1") as CP1252 but
# can be fooled into seeing it as UTF8.

use strict;
use warnings;
use Test::More tests => 5;

use Pod::Simple::DumpAsXML;
use Pod::Simple::XMLOutStream;


# Render a POD string with Pod::Simple::XMLOutStream and return the result
# split into lines (any run of CR/LF characters is one separator).
sub xml_lines {
    my ($pod) = @_;
    return split m/[\r\n]+/, Pod::Simple::XMLOutStream->_out($pod);
}

# Emit exactly one test result for a parsed document.
#
#   $lines  - array ref of output lines, as returned by xml_lines()
#   $name   - description used when the test passes
#   $expect - optional literal text that must occur in at least one line
#
# The test passes when the parser's "Non-ASCII ... Assuming <encoding>"
# complaint names CP1252 and, if $expect is given, that text is present.
# Each failure mode reports its own diagnostic as the test name.
sub cp1252_ok {
    my ($lines, $name, $expect) = @_;

    my ($guess) = "@$lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

    return fail("parser failed to detect non-ASCII bytes in input")
        unless $guess;
    return fail("parser guessed wrong encoding expected 'CP1252' got '$guess'")
        if $guess ne 'CP1252';
    return fail("failed to find expected control character in output")
        if defined $expect && !grep { m{\Q$expect\E} } @$lines;

    return pass($name);
}

# Same as cp1252_ok() without an expected-text check, but the test is
# skipped on platforms whose native character set is not ASCII-based,
# because the raw bytes in the input only mean what the test intends there.
sub cp1252_ok_if_ascii {
    my ($lines, $name) = @_;

  SKIP: {
        skip 'ASCII-platform dependent test', 1 if ord('A') != 65;
        cp1252_ok($lines, $name);
    }
    return;
}


# Initial, isolated, non-ASCII byte triggers CP1252 guess and later
# multi-byte sequence is not considered by heuristic.

my $x97;
my $x91;
my $dash;
if ($] ge 5.007_003) {
    $x97 = chr utf8::unicode_to_native(0x97);
    $x91 = chr utf8::unicode_to_native(0x91);

    # XMLOutStream writes non-ASCII characters as numeric character
    # references, so the em dash (U+2014) that CP1252 byte 0x97 decodes to
    # appears in the output as '&#8212;', never as a literal dash character.
    $dash = '&#8212;';
}
else {  # Tests will fail for early EBCDICs
    $x97 = chr 0x97;
    $x91 = chr 0x91;
    $dash = '--';
}

my @output_lines = xml_lines( qq{

=head1 NAME

Em::Dash $x97 ${x91}CAF\xC9\x92

=cut

} );

cp1252_ok(
    \@output_lines,
    'isolated non-ASCII byte is guessed as CP1252 and decoded',
    "Dash $dash",
);


# Initial smart-quote character triggers CP1252 guess as expected

@output_lines = xml_lines( qq{

=head1 NAME

Smart::Quote - ${x91}FUT\xC9\x92

=cut

} );

cp1252_ok_if_ascii( \@output_lines,
    'leading smart quote is guessed as CP1252' );


# Initial accented character (E acute) followed by 'smart' apostrophe is legal
# CP1252, which should be preferred over UTF-8 because the latter
# interpretation would be "JOS" . \N{LATIN SMALL LETTER TURNED ALPHA} . "S
# PLACE", and that \N{} letter is an IPA one.

@output_lines = xml_lines( qq{

=head1 NAME

=head2 JOS\xC9\x92S PLACE

=cut

} );

cp1252_ok_if_ascii( \@output_lines,
    'E acute followed by smart apostrophe is guessed as CP1252' );


# The previous example used a CP1252 byte sequence that also happened to be a
# valid UTF8 byte sequence.  In this example we use an illegal UTF-8 sequence
# (it needs a third byte), so must be 1252

@output_lines = xml_lines( qq{

=head1 NAME

Smart::Apostrophe::Fail - L\xE9\x92Strange

=cut

} );

cp1252_ok_if_ascii( \@output_lines,
    'illegal UTF-8 sequence is guessed as CP1252' );


# The following is a real-world example of something in CP1252 expressible in
# UTF-8, but doesn't make sense in UTF-8, contributed by Bo Lindbergh.
# Muvrarášša is a Sami word

@output_lines = xml_lines( qq{

=head1 NAME

Muvrar\xE1\x9A\x9Aa is a mountain in Norway

=cut

} );

cp1252_ok_if_ascii( \@output_lines,
    'CP1252 text that is also well-formed UTF-8 is guessed as CP1252' );