1 # The encoding detection heuristic will choose UTF8 or Latin-1. The current
2 # implementation will usually treat CP1252 (aka "Win-Latin-1") as Latin-1 but
3 # can be fooled into seeing it as UTF8.
5 # Note 1: Neither guess is 'correct': even if we choose Latin-1, all the
6 # smart quote symbols will be rendered as control characters.
8 # Note 2: The guess is only applied if the source POD omits =encoding, so
9 # CP1252 source will render correctly if its encoding is properly declared.
21 BEGIN { plan tests => 5 };
25 use Pod::Simple::DumpAsXML;
26 use Pod::Simple::XMLOutStream;
29 # An initial, isolated non-ASCII byte triggers the Latin-1 guess; the later
30 # multi-byte sequence is not considered by the heuristic.
32 my @output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
36 Em::Dash \x97 \x91CAF\xC9\x92
42 my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
44 if( $guess eq 'ISO8859-1' ) {
45 if( grep m{Dash (\x97|&#x97;|&#151;)}, @output_lines ) { # raw \x97 byte, or its hex/decimal numeric character reference
49 print "# failed to find expected control character in output\n"
53 print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
57 print "# parser failed to detect non-ASCII bytes in input\n";
61 # Initial smart-quote character triggers Latin-1 guess as expected
63 @output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
67 Smart::Quote - \x91FUT\xC9\x92
73 ($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
75 if( $guess eq 'ISO8859-1' ) {
79 print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
83 print "# parser failed to detect non-ASCII bytes in input\n";
87 # An initial accented character followed by a 'smart' apostrophe causes the
88 # heuristic to choose UTF8 (a rather contrived example).
90 @output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
94 Smart::Apostrophe::Fail - L\xC9\x92STRANGE
100 ($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
102 if( $guess eq 'UTF-8' ) {
106 print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
110 print "# parser failed to detect non-ASCII bytes in input\n";
114 # The previous example used a CP1252 byte sequence that also happened to be a
115 # valid UTF8 byte sequence. In this example the heuristic also guesses 'wrong'
116 # despite the bytes not being valid UTF8 (\xE9 needs two continuation bytes,
117 # only one follows). This could arguably be 'fixed' by a less naive regex.
119 @output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
123 Smart::Apostrophe::Fail - L\xE9\x92Strange
129 ($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
131 if( $guess eq 'UTF-8' ) {
135 print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
139 print "# parser failed to detect non-ASCII bytes in input\n";