# cpan/Pod-Simple/t/encod04.t
#
# The encoding detection heuristic will choose UTF8 or Latin-1.  The current
# implementation will usually treat CP1252 (aka "Win-Latin-1") as Latin-1 but
# can be fooled into seeing it as UTF8.
#
# Note 1: Neither guess is 'correct' since even if we choose Latin-1, all the
#         smart quote symbols will be rendered as control characters.
#
# Note 2: The guess is only applied if the source POD omits =encoding, so
#         CP1252 source will render correctly if properly declared.
#

BEGIN {
    # When run from the perl core tree, run from t/ against the built lib.
    if($ENV{PERL_CORE}) {
        chdir 't';
        @INC = '../lib';
    }
}

use strict;
use warnings;
use Test;
BEGIN { plan tests => 5 };

ok 1;

use Pod::Simple::DumpAsXML;
use Pod::Simple::XMLOutStream;


# check_guess( $pod, $expected [, $also_wanted] )
#
# Renders $pod through Pod::Simple::XMLOutStream and reports exactly one test
# result.  The test passes when the output carries a
# "Non-ASCII ... Assuming <encoding>" message whose encoding equals $expected
# and, if the optional regex $also_wanted is supplied, at least one output
# line matches it.  On failure a "# ..." diagnostic line follows the result.
#
#   $pod         - POD source text (raw bytes, no =encoding declaration)
#   $expected    - encoding name the parser is expected to announce
#   $also_wanted - optional qr// that some output line must match
#
# Returns nothing.
sub check_guess {
    my ($pod, $expected, $also_wanted) = @_;

    # Make Test.pm attribute a failure to the calling stanza's line rather
    # than to a line inside this helper.
    local $Test::TestLevel = $Test::TestLevel + 1;

    my @output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out($pod);

    # The lines are joined so the message is found even if it was wrapped.
    my ($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

    if (!$guess) {
        ok 0;
        print "# parser failed to detect non-ASCII bytes in input\n";
        return;
    }

    if ($guess ne $expected) {
        ok 0;
        print "# parser guessed wrong encoding expected '$expected' got '$guess'\n";
        return;
    }

    if ($also_wanted) {
        my $matches = grep { $_ =~ $also_wanted } @output_lines;
        if (!$matches) {
            ok 0;
            print "# failed to find expected control character in output\n";
            return;
        }
    }

    ok 1;
    return;
}


# Initial, isolated, non-ASCII byte triggers Latin-1 guess and later
# multi-byte sequence is not considered by heuristic.  The \x97 byte must
# also show up in the output, either raw or as a numeric character reference.

check_guess( qq{

=head1 NAME

Em::Dash \x97 \x91CAF\xC9\x92

=cut

}, 'ISO8859-1', qr{Dash (\x97|&#x97;|&#151;)} );


# Initial smart-quote character triggers Latin-1 guess as expected

check_guess( qq{

=head1 NAME

Smart::Quote - \x91FUT\xC9\x92

=cut

}, 'ISO8859-1' );


# Initial accented character followed by 'smart' apostrophe causes heuristic
# to choose UTF8 (a rather contrived example)

check_guess( qq{

=head1 NAME

Smart::Apostrophe::Fail - L\xC9\x92STRANGE

=cut

}, 'UTF-8' );


# The previous example used a CP1252 byte sequence that also happened to be a
# valid UTF8 byte sequence.  In this example the heuristic also guesses 'wrong'
# despite the byte sequence not being valid UTF8 (it's too short).  This could
# arguably be 'fixed' by using a less naive regex.

check_guess( qq{

=head1 NAME

Smart::Apostrophe::Fail - L\xE9\x92Strange

=cut

}, 'UTF-8' );


exit;