Re: [PATCH] lib/Pod/Html.pm plus a funky UTF-8 regex bug

[perl5.git] / lib / Pod / Html.pm
diff --git a/lib/Pod/Html.pm b/lib/Pod/Html.pm

index 239d305..b17a844 100644 (file)
--- a/lib/Pod/Html.pm
+++ b/lib/Pod/Html.pm
@@ -3,7 +3,7 @@ use strict;
  require Exporter;
  
  use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
-$VERSION = 1.0503;
+$VERSION = 1.08;
  @ISA = qw(Exporter);
  @EXPORT = qw(pod2html htmlify);
  @EXPORT_OK = qw(anchorify);
@@ -32,9 +32,20 @@ Converts files from pod format (see L<perlpod>) to HTML format.  It
  can automatically generate indexes and cross-references, and it keeps
  a cache of things it knows how to cross-reference.
  
-=head1 ARGUMENTS
+=head1 FUNCTIONS
  
-Pod::Html takes the following arguments:
+=head2 pod2html
+
+    pod2html("pod2html",
+             "--podpath=lib:ext:pod:vms",
+             "--podroot=/usr/src/perl",
+             "--htmlroot=/perl/nmanual",
+             "--libpods=perlfunc:perlguts:perlvar:perlrun:perlop",
+             "--recurse",
+             "--infile=foo.pod",
+             "--outfile=/perl/nmanual/foo.html");
+
+pod2html takes the following arguments:
  
  =over 4
  
@@ -187,16 +198,20 @@ Display progress messages.  By default, they won't be displayed.
  
  =back
  
-=head1 EXAMPLE
+=head2 htmlify
  
-    pod2html("pod2html",
-            "--podpath=lib:ext:pod:vms",
-            "--podroot=/usr/src/perl",
-            "--htmlroot=/perl/nmanual",
-            "--libpods=perlfunc:perlguts:perlvar:perlrun:perlop",
-            "--recurse",
-            "--infile=foo.pod",
-            "--outfile=/perl/nmanual/foo.html");
+    htmlify($heading);
+
+Converts a pod section specification to a suitable section specification
+for HTML. Note that we keep spaces and special characters (including the
+hyphen); only C<"> and C<?> are removed (Netscape problem).
+
+=head2 anchorify
+
+    anchorify(@heading);
+
+Similar to C<htmlify()>, but turns non-alphanumerics into underscores.  Note
+that C<anchorify()> is not exported by default.
  
  =head1 ENVIRONMENT
  
@@ -216,7 +231,6 @@ This program is distributed under the Artistic License.
  
  =cut
  
-
  my($Cachedir);
  my($Dircache, $Itemcache);
  my @Begin_Stack;
@@ -252,7 +266,6 @@ my %Items = ();                     # associative array used to find the location
  
  my %Local_Items;
  my $Is83;
-my $PTQuote;
  
  my $Curdir = File::Spec->curdir;
  
@@ -303,7 +316,6 @@ sub init_globals {
                                 #   to prevent the first <hr /> directive.
      $Paragraph = '';           # which paragraph we're processing (used
                                 #   for error messages)
-    $PTQuote = 0;               # status of double-quote conversion
      %Sections = ();            # sections within this page
  
      %Local_Items = ();
@@ -485,21 +497,31 @@ END_OF_HEAD
      # still generate an index, but surround it with an html comment.
      # that way some other program can extract it if desired.
      $index =~ s/--+/-/g;
-    print HTML "<p><a name=\"__index__\"></a></p>\n";
-    print HTML "<!-- INDEX BEGIN -->\n";
-    print HTML "<!--\n" unless $Doindex;
-    print HTML $index;
-    print HTML "-->\n" unless $Doindex;
-    print HTML "<!-- INDEX END -->\n\n";
-    print HTML "<hr />\n" if $Doindex and $index;
+
+    my $hr = ($Doindex and $index) ? qq(<hr name="index" />) : "";
+
+    unless ($Doindex)
+    {
+        $index = qq(<!--\n$index\n-->\n);
+    }
+
+    print HTML << "END_OF_INDEX";
+
+<!-- INDEX BEGIN -->
+<div name="index">
+<p><a name=\"__index__\"></a></p>
+$index
+$hr
+</div>
+<!-- INDEX END -->
+
+END_OF_INDEX
  
      # now convert this file
      my $after_item;             # set to true after an =item
      my $need_dd = 0;
      warn "Converting input file $Podfile\n" if $Verbose;
      foreach my $i (0..$#poddata){
-        $PTQuote = 0; # status of quote conversion
-
         $_ = $poddata[$i];
         $Paragraph = $i+1;
         if (/^(=.*)/s) {        # is it a pod directive?
@@ -1107,7 +1129,7 @@ my $EmittedItem;
  
  sub emit_item_tag($$$){
      my( $otext, $text, $compact ) = @_;
-    my $item = fragment_id( $text );
+    my $item = fragment_id( $text , -generate);
  
      $EmittedItem = $item;
      ### print STDERR "emit_item_tag=$item ($text)\n";
@@ -1116,9 +1138,9 @@ sub emit_item_tag($$$){
      if ($Items_Named{$item}++) {
         print HTML process_text( \$otext );
      } else {
-        my $name = 'item_' . $item;
+        my $name = $item;
          $name = anchorify($name);
-       print HTML qq{<a name="$name">}, process_text( \$otext ), '</a>';
+       print HTML qq{<a name="$name" class="item">}, process_text( \$otext ), '</a>';
      }
      print HTML "</strong>\n";
      undef( $EmittedItem );
@@ -1393,12 +1415,12 @@ sub process_pre {
  #
  sub pure_text($){
      my $text = shift();
-    process_puretext( $text, \$PTQuote, 1 );
+    process_puretext( $text, 1 );
  }
  
  sub inIS_text($){
      my $text = shift();
-    process_puretext( $text, \$PTQuote, 0 );
+    process_puretext( $text, 0 );
  }
  
  #
@@ -1406,7 +1428,7 @@ sub inIS_text($){
  #  double-quotes and handling implicit C<> links.
  #
  sub process_puretext {
-    my($text, $quote, $notinIS) = @_;
+    my($text, $notinIS) = @_;
  
      ## Guessing at func() or [\$\@%&]*var references in plain text is destined
      ## to produce some strange looking ref's. uncomment to disable:
@@ -1414,13 +1436,6 @@ sub process_puretext {
  
      my(@words, $lead, $trail);
  
-    # convert double-quotes to single-quotes
-    if( $$quote && $text =~ s/"/''/s ){
-        $$quote = 0;
-    }
-    while ($text =~ s/"([^"]*)"/``$1''/sg) {};
-    $$quote = 1 if $text =~ s/"/``/s;
-
      # keep track of leading and trailing white-space
      $lead  = ($text =~ s/\A(\s+)//s ? $1 : "");
      $trail = ($text =~ s/(\s+)\Z//s ? $1 : "");
@@ -1486,9 +1501,26 @@ sub process_text {
      return if $Ignore;
      my( $tref ) = @_;
      my $res = process_text1( 0, $tref );
+    $res =~ s/\s+$//s;
      $$tref = $res;
  }
  
+# process_text_rfc_links($text) - return $text with RFC references linked.
+sub process_text_rfc_links {
+    my $text = shift;
+
+    # For every "RFCnnnn" or "RFC nnn", link it to the authoritative
+    # source. Do not use the /i modifier here. Require "RFC" to be written
+    # in capital letters.
+    $text =~ s{
+       (?<=[^<>[:alpha:]])           # Make sure this is not an URL already
+       (RFC\s*([0-9]{1,5}))(?![0-9]) # capture 2 is the number, max 5 digits
+    }
+    {<a href="http://www.ietf.org/rfc/rfc$2.txt" class="rfc">$1</a>}gx;
+
+    $text;
+}
+
  sub process_text1($$;$$){
      my( $lev, $rstr, $func, $closing ) = @_;
      my $res = '';
@@ -1523,11 +1555,11 @@ sub process_text1($$;$$){
         $res = "&$escape;";
  
      } elsif( $func eq 'F' ){
-       # F<filename> - italizice
-       $res = '<em>' . process_text1( $lev, $rstr ) . '</em>';
+       # F<filename> - italicize
+       $res = '<em class="file">' . process_text1( $lev, $rstr ) . '</em>';
  
      } elsif( $func eq 'I' ){
-       # I<text> - italizice
+       # I<text> - italicize
         $res = '<em>' . process_text1( $lev, $rstr ) . '</em>';
  
      } elsif( $func eq 'L' ){
@@ -1677,6 +1709,7 @@ sub process_text1($$;$$){
         } else {
             warn "$0: $Podfile: undelimited $func<> in paragraph $Paragraph.\n" unless $Quiet;
         }
+       $res = process_text_rfc_links($res);
      }
      return $res;
  }
@@ -1821,10 +1854,9 @@ sub page_sect($$) {
             $section = "#$section" if $section;
              ### print STDERR "...section=$section\n";
  
-           # check if there is a .pod with the page name
-           if ($Pages{$page} =~ /([^:]*)\.pod:/) {
-               $link = "$Htmlroot/$1.html$section";
-           } elsif ($Pages{$page} =~ /([^:]*)\.pm:/) {
+           # check if there is a .pod with the page name.
+           # for L<Foo>, Foo.(pod|pm) is preferred to A/Foo.(pod|pm)
+           if ($Pages{$page} =~ /([^:]*)\.(?:pod|pm):/) {
                 $link = "$Htmlroot/$1.html$section";
             } else {
                 $link = "";
@@ -1981,7 +2013,8 @@ sub htmlify {
      $heading =~ s/\s+\Z//;
      $heading =~ s/\A\s+//;
      # The hyphen is a disgrace to the English language.
-    $heading =~ s/[-"?]//g;
+    # $heading =~ s/[-"?]//g;
+    $heading =~ s/["?]//g;
      $heading = lc( $heading );
      return $heading;
  }
@@ -2026,7 +2059,7 @@ sub depod1($;$$){
    return $res unless defined $$rstr;
    if( ! defined( $func ) ){
        # skip to next begin of an interior sequence
-      while( $$rstr =~ s/\A(.*?)([BCEFILSXZ])<(<+[^\S\n]+)?// ){
+      while( $$rstr =~ s/\A(.*?)([BCEFILSXZ])<(<+[^\S\n]+)?//s ){
           # recurse into its text
           $res .= $1 . depod1( $rstr, $2, closing $3);
        }
@@ -2045,7 +2078,7 @@ sub depod1($;$$){
        # all others: either recurse into new function or
        # terminate at closing angle bracket
        my $term = pattern $closing;
-      while( $$rstr =~ s/\A(.*?)(([BCEFILSXZ])<(<+[^\S\n]+)?|$term)// ){
+      while( $$rstr =~ s/\A(.*?)(([BCEFILSXZ])<(<+[^\S\n]+)?|$term)//s ){
           $res .= $1;
           last unless $3;
            $res .= depod1( $rstr, $3, closing $4 );
@@ -2057,14 +2090,69 @@ sub depod1($;$$){
    return $res;
  }
  
+{
+    my %seen;   # persists across calls: how often each id was generated
+# fragment_id_readable($text, $generate) - turn $text into a readable
+# fragment id; with a true $generate, repeated ids get a numeric suffix.
+sub fragment_id_readable {
+    my $text     = shift;
+    my $generate = shift;   # optional flag
+    my $orig     = $text;
+    # leave the words for the fragment identifier,
+    # change everything else to underbars.
+    $text =~ s/[^A-Za-z0-9_]+/_/g; # do not use \W to avoid locale dependency.
+    $text =~ s/_{2,}/_/g;
+    $text =~ s/\A_//;
+    $text =~ s/_\Z//;
+
+    # Test the length, not the truth value: an id of "0" (e.g. from "-0")
+    # is valid and must not be mistaken for "nothing left".
+    unless (length $text)
+    {
+        # Nothing left after removing punctuation, so leave it as is
+        # E.g. if option is named: "=item -#"
+        $text = $orig;
+    }
+
+    if ($generate) {
+        if ( exists $seen{$text} ) {
+            # This already exists, make it unique
+            $seen{$text}++;
+            $text = $text . $seen{$text};
+        } else {
+            $seen{$text} = 1;  # first time seen this fragment
+        }
+    }
+
+    $text;
+}}
+
+my @HC;   # cache of escapes, indexed by character code
+# fragment_id_obfuscated - this was the old "_2d_2d__" scheme: normalize
+# the text by obfuscating the fragment id to make it unique.
+sub fragment_id_obfuscated {
+    my( $text, $generate ) = @_;    # $generate is an optional flag
+
+    $text =~ s/\s+/_/sg;
+    $text =~ s{(\W)}{
+        my $code = ord($1);
+        $HC[$code] = sprintf( "%%%02X", $code ) unless defined $HC[$code];
+        $HC[$code];
+    }gxe;
+
+    return substr( $text, 0, 50 );
+}
+
  #
  # fragment_id - construct a fragment identifier from:
  #   a) =item text
  #   b) contents of C<...>
  #
-my @HC;
+
  sub fragment_id {
-    my $text = shift();
+    my $text     = shift;
+    my $generate = shift;   # optional flag
+
      $text =~ s/\s+\Z//s;
      if( $text ){
         # a method or function?
@@ -2082,17 +2170,12 @@ sub fragment_id {
  
         # honour the perlfunc manpage: func [PAR[,[ ]PAR]...]
         # and some funnies with ... Module ...
-       return $1 if $text =~ m{^([a-z\d_]+)(\s+[A-Z\d,/& ]+)?$};
+       return $1 if $text =~ m{^([a-z\d_]+)(\s+[A-Z,/& ][A-Z\d,/& ]*)?$};
         return $1 if $text =~ m{^([a-z\d]+)\s+Module(\s+[A-Z\d,/& ]+)?$};
  
-       # text? normalize!
-       $text =~ s/\s+/_/sg;
-       $text =~ s{(\W)}{
-         defined( $HC[ord($1)] ) ? $HC[ord($1)]
-                 : ( $HC[ord($1)] = sprintf( "%%%02X", ord($1) ) ) }gxe;
-        $text = substr( $text, 0, 50 );
+       return fragment_id_readable($text, $generate);
      } else {
-       return undef();
+       return;
      }
  }