Unicode documentation updates

[perl5.git] / pod / perlfaq5.pod
diff --git a/pod/perlfaq5.pod b/pod/perlfaq5.pod

index e6c1dcc..09da5bb 100644 (file)
--- a/pod/perlfaq5.pod
+++ b/pod/perlfaq5.pod
@@ -1,6 +1,6 @@
  =head1 NAME
  
-perlfaq5 - Files and Formats ($Revision: 10126 $)
+perlfaq5 - Files and Formats
  
  =head1 DESCRIPTION
  
@@ -10,56 +10,88 @@ formats, and footers.
  =head2 How do I flush/unbuffer an output filehandle?  Why must I do this?
  X<flush> X<buffer> X<unbuffer> X<autoflush>
  
-Perl does not support truly unbuffered output (except insofar as you
-can C<syswrite(OUT, $char, 1)>), although it does support is "command
-buffering", in which a physical write is performed after every output
-command.
-
-The C standard I/O library (stdio) normally buffers characters sent to
-devices so that there isn't a system call for each byte. In most stdio
-implementations, the type of output buffering and the size of the
-buffer varies according to the type of device. Perl's C<print()> and
-C<write()> functions normally buffer output, while C<syswrite()>
-bypasses buffering all together.
-
-If you want your output to be sent immediately when you execute
-C<print()> or C<write()> (for instance, for some network protocols),
-you must set the handle's autoflush flag. This flag is the Perl
-variable C<$|> and when it is set to a true value, Perl will flush the
-handle's buffer after each C<print()> or C<write()>. Setting C<$|>
-affects buffering only for the currently selected default filehandle.
-You choose this handle with the one argument C<select()> call (see
-L<perlvar/$E<verbar>> and L<perlfunc/select>).
-
-Use C<select()> to choose the desired handle, then set its
-per-filehandle variables.
-
-       $old_fh = select(OUTPUT_HANDLE);
-       $| = 1;
-       select($old_fh);
+(contributed by brian d foy)
  
-Some modules offer object-oriented access to handles and their
-variables, although they may be overkill if this is the only thing you
-do with them.  You can use C<IO::Handle>:
+You might like to read Mark Jason Dominus's "Suffering From Buffering"
+at http://perl.plover.com/FAQs/Buffering.html .
  
-       use IO::Handle;
-       open my( $printer ), ">", "/dev/printer");   # but is this?
-       $printer->autoflush(1);
+Perl normally buffers output so it doesn't make a system call for every
+bit of output. By saving up output, it makes fewer expensive system calls.
+For instance, in this little bit of code, you want to print a dot to the
+screen for every line you process to watch the progress of your program.
+Instead of seeing a dot for every line, Perl buffers the output and you
+have a long wait before you see a row of 50 dots all at once:
+
+       # long wait, then row of dots all at once
+       while( <> ) {
+               print ".";
+               print "\n" unless ++$count % 50;
+
+               #... expensive line processing operations
+               }
+
+To get around this, you have to unbuffer the output filehandle, in this
+case, C<STDOUT>. You can set the special variable C<$|> to a true value
+(mnemonic: making your filehandles "piping hot"):
+
+       $|++;
+
+       # dot shown immediately
+       while( <> ) {
+               print ".";
+               print "\n" unless ++$count % 50;
+
+               #... expensive line processing operations
+               }
+
+The C<$|> is one of the per-filehandle special variables, so each
+filehandle has its own copy of its value. If you want to merge
+standard output and standard error for instance, you have to unbuffer
+each (although STDERR might be unbuffered by default):
+
+       {
+       my $previous_default = select(STDOUT);  # save previous default
+       $|++;                                   # autoflush STDOUT
+       select(STDERR);
+       $|++;                                   # autoflush STDERR, to be sure
+       select($previous_default);              # restore previous default
+       }
  
-or C<IO::Socket> (which inherits from C<IO::Handle>):
+       # now should alternate . and +
+       while( 1 )
+               {
+               sleep 1;
+               print STDOUT ".";
+               print STDERR "+";
+               print STDOUT "\n" unless ++$count % 25;
+               }
+
+Besides the C<$|> special variable, you can use C<binmode> to give
+your filehandle a C<:unix> layer, which is unbuffered:
  
-       use IO::Socket;           # this one is kinda a pipe?
-       my $sock = IO::Socket::INET->new( 'www.example.com:80' );
+       binmode( STDOUT, ":unix" );
+
+       while( 1 ) {
+               sleep 1;
+               print ".";
+               print "\n" unless ++$count % 50;
+               }
  
-       $sock->autoflush();
+For more information on output layers, see the entries for C<binmode>
+and C<open> in L<perlfunc>, and the C<PerlIO> module documentation.
  
-You can also flush an C<IO::Handle> object without setting
-C<autoflush>. Call the C<flush> method to flush the buffer yourself:
+If you are using C<IO::Handle> or one of its subclasses, you can
+call the C<autoflush> method to change the settings of the
+filehandle:
  
         use IO::Handle;
-       open my( $printer ), ">", "/dev/printer"); 
-       $printer->flush; # one time flush
+       open my( $io_fh ), ">", "output.txt";
+       $io_fh->autoflush(1);
  
+The C<IO::Handle> objects also have a C<flush> method. You can flush
+the buffer any time you want without turning on autoflush:
+
+       $io_fh->flush;
  
  =head2 How do I change, delete, or insert a line in a file, or append to the beginning of a file?
  X<file, editing>
@@ -95,7 +127,7 @@ the loop that prints the existing lines.
         open my $in,  '<',  $file      or die "Can't read old file: $!";
         open my $out, '>', "$file.new" or die "Can't write new file: $!";
  
-       print "# Add this line to the top\n"; # <--- HERE'S THE MAGIC
+       print $out "# Add this line to the top\n"; # <--- HERE'S THE MAGIC
  
         while( <$in> )
                 {
@@ -112,7 +144,7 @@ be sure that you're supposed to do that on every line!
         open my $in,  '<',  $file      or die "Can't read old file: $!";
         open my $out, '>', "$file.new" or die "Can't write new file: $!";
  
-       print "# Add this line to the top\n";
+       print $out "# Add this line to the top\n";
  
         while( <$in> )
                 {
@@ -141,7 +173,7 @@ print it. After that, read the rest of the lines and print those:
                 {
                 print $out $_;
                 }
-               
+
  To skip lines, use the looping controls. The C<next> in this example
  skips comment lines, and the C<last> stops all processing once it
  encounters either C<__END__> or C<__DATA__>.
@@ -164,7 +196,7 @@ example skips every fifth line:
                 }
  
  If, for some odd reason, you really want to see the whole file at once
-rather than processing line by line, you can slurp it in (as long as
+rather than processing line-by-line, you can slurp it in (as long as
  you can fit the whole thing in memory!):
  
         open my $in,  '<',  $file      or die "Can't read old file: $!"
@@ -243,6 +275,52 @@ proper text file, so this may report one fewer line than you expect.
  
  This assumes no funny games with newline translations.
  
+=head2 How do I delete the last N lines from a file?
+X<lines> X<file>
+
+(contributed by brian d foy)
+
+The easiest conceptual solution is to count the lines in the 
+file then start at the beginning and print the number of lines
+(minus the last N) to a new file.
+
+Most often, the real question is how you can delete the last N
+lines without making more than one pass over the file, or how to
+do it without a lot of copying. The easy concept is the hard reality
+when you might have millions of lines in your file.
+
+One trick is to use C<File::ReadBackwards>, which starts at the end of 
+the file. That module provides an object that wraps the real filehandle
+to make it easy for you to move around the file. Once you get to the 
+spot you need, you can get the actual filehandle and work with it as
+normal. In this case, you get the file position at the end of the last
+line you want to keep and truncate the file to that point:
+
+       use File::ReadBackwards;
+       
+       my $filename = 'test.txt';
+       my $Lines_to_truncate = 2;
+
+       my $bw = File::ReadBackwards->new( $filename ) 
+               or die "Could not read backwards in [$filename]: $!";
+       
+       my $lines_from_end = 0;
+       until( $bw->eof or $lines_from_end == $Lines_to_truncate ) 
+               {
+               print "Got: ", $bw->readline;
+               $lines_from_end++;
+               }
+       
+       truncate( $filename, $bw->tell );
+
+The C<File::ReadBackwards> module also has the advantage of letting
+you set the input record separator to a regular expression.
+
+You can also use the C<Tie::File> module which lets you access
+the lines through a tied array. You can use normal array operations
+to modify your file, including setting the last index and using 
+C<splice>.
+
  =head2 How can I use Perl's C<-i> option from within a program?
  X<-i> X<in-place>
  
@@ -270,11 +348,11 @@ leaving a backup of the original data from each file in a new
  C<.c.orig> file.
  
  =head2 How can I copy a file?
-X<copy> X<file, copy>
+X<copy> X<file, copy> X<File::Copy>
  
  (contributed by brian d foy)
  
-Use the File::Copy module. It comes with Perl and can do a
+Use the C<File::Copy> module. It comes with Perl and can do a
  true copy across file systems, and it does its magic in
  a portable fashion.
  
@@ -282,16 +360,17 @@ a portable fashion.
  
         copy( $original, $new_copy ) or die "Copy failed: $!";
  
-If you can't use File::Copy, you'll have to do the work yourself:
+If you can't use C<File::Copy>, you'll have to do the work yourself:
  open the original file, open the destination file, then print
-to the destination file as you read the original.
+to the destination file as you read the original. You also have to
+remember to copy the permissions, owner, and group to the new file.
  
  =head2 How do I make a temporary file name?
  X<file, temporary>
  
  If you don't need to know the name of the file, you can use C<open()>
-with C<undef> in place of the file name.  The C<open()> function
-creates an anonymous temporary file.
+with C<undef> in place of the file name.  In Perl 5.8 or later, the
+C<open()> function creates an anonymous temporary file:
  
         open my $tmp, '+>', undef or die $!;
  
@@ -340,7 +419,7 @@ temporary files in one process, use a counter:
                         return ();
                         }
                 }
-               
+
         }
  
  =head2 How can I manipulate fixed-record-length files?
@@ -524,13 +603,13 @@ X<write, into a string>
  See L<perlform/"Accessing Formatting Internals"> for an C<swrite()> function.
  
  =head2 How can I open a filehandle to a string?
-X<string>, X<open>, X<IO::Scalar>, X<filehandle>
+X<string> X<open> X<IO::String> X<filehandle>
  
  (contributed by Peter J. Holzer, hjp-usenet2@hjp.at)
  
-Since Perl 5.8.0, you can pass a reference to a scalar instead of the
-filename to create a file handle which you can used to read from or write to
-a string:
+Since Perl 5.8.0 a file handle referring to a string can be created by
+calling open with a reference to that string instead of the filename.
+This file handle can then be used to read from or write to the string:
  
         open(my $fh, '>', \$string) or die "Could not open string for writing";
         print $fh "foo\n";
@@ -581,11 +660,11 @@ It is easier to see with comments:
  =head2 How can I translate tildes (~) in a filename?
  X<tilde> X<tilde expansion>
  
-Use the <> (glob()) operator, documented in L<perlfunc>.  Older
-versions of Perl require that you have a shell installed that groks
-tildes.  Recent perl versions have this feature built in. The
-File::KGlob module (available from CPAN) gives more portable glob
-functionality.
+Use the E<lt>E<gt> (C<glob()>) operator, documented in L<perlfunc>.
+Versions of Perl older than 5.6 require that you have a shell
+installed that groks tildes.  Later versions of Perl have this feature
+built in. The C<File::KGlob> module (available from CPAN) gives more
+portable glob functionality.
  
  Within Perl, you may use this directly:
  
@@ -692,10 +771,11 @@ one that doesn't use the shell to do globbing.
  =head2 Is there a leak/bug in glob()?
  X<glob>
  
-Due to the current implementation on some operating systems, when you
-use the glob() function or its angle-bracket alias in a scalar
-context, you may cause a memory leak and/or unpredictable behavior.  It's
-best therefore to use glob() only in list context.
+(contributed by brian d foy)
+
+Starting with Perl 5.6.0, C<glob> is implemented internally rather
+than relying on an external resource. As such, memory issues with 
+C<glob> aren't a problem in modern perls.
  
  =head2 How can I open a file with a leading ">" or trailing blanks?
  X<filename, special characters>
@@ -715,21 +795,19 @@ characters in the filename as special.
         open FILE, ">", ">file";     # filename is ">file"
  
  =head2 How can I reliably rename a file?
-X<rename> X<mv> X<move> X<file, rename> X<ren>
+X<rename> X<mv> X<move> X<file, rename>
  
  If your operating system supports a proper mv(1) utility or its
  functional equivalent, this works:
  
         rename($old, $new) or system("mv", $old, $new);
  
-It may be more portable to use the File::Copy module instead.
+It may be more portable to use the C<File::Copy> module instead.
  You just copy to the new file to the new name (checking return
  values), then delete the old one.  This isn't really the same
-semantically as a rename(), which preserves meta-information like
+semantically as a C<rename()>, which preserves meta-information like
  permissions, timestamps, inode info, etc.
  
-Newer versions of File::Copy export a move() function.
-
  =head2 How can I lock a file?
  X<lock> X<file, lock> X<flock>
  
@@ -826,31 +904,33 @@ If the count doesn't impress your friends, then the code might.  :-)
  =head2 All I want to do is append a small amount of text to the end of a file.  Do I still have to use locking?
  X<append> X<file, append>
  
-If you are on a system that correctly implements flock() and you use the
-example appending code from "perldoc -f flock" everything will be OK
-even if the OS you are on doesn't implement append mode correctly (if
-such a system exists.) So if you are happy to restrict yourself to OSs
-that implement flock() (and that's not really much of a restriction)
-then that is what you should do.
+If you are on a system that correctly implements C<flock> and you use
+the example appending code from "perldoc -f flock" everything will be
+OK even if the OS you are on doesn't implement append mode correctly
+(if such a system exists.) So if you are happy to restrict yourself to
+OSs that implement C<flock> (and that's not really much of a
+restriction) then that is what you should do.
  
  If you know you are only going to use a system that does correctly
-implement appending (i.e. not Win32) then you can omit the seek() from
-the code in the previous answer.
-
-If you know you are only writing code to run on an OS and filesystem that
-does implement append mode correctly (a local filesystem on a modern
-Unix for example), and you keep the file in block-buffered mode and you
-write less than one buffer-full of output between each manual flushing
-of the buffer then each bufferload is almost guaranteed to be written to
-the end of the file in one chunk without getting intermingled with
-anyone else's output. You can also use the syswrite() function which is
-simply a wrapper around your systems write(2) system call.
+implement appending (i.e. not Win32) then you can omit the C<seek>
+from the code in the previous answer.
+
+If you know you are only writing code to run on an OS and filesystem
+that does implement append mode correctly (a local filesystem on a
+modern Unix for example), and you keep the file in block-buffered mode
+and you write less than one buffer-full of output between each manual
+flushing of the buffer then each bufferload is almost guaranteed to be
+written to the end of the file in one chunk without getting
+intermingled with anyone else's output. You can also use the
+C<syswrite> function which is simply a wrapper around your system's
+C<write(2)> system call.
  
  There is still a small theoretical chance that a signal will interrupt
-the system level write() operation before completion.  There is also a
-possibility that some STDIO implementations may call multiple system
-level write()s even if the buffer was empty to start.  There may be some
-systems where this probability is reduced to zero.
+the system level C<write()> operation before completion. There is also
+a possibility that some STDIO implementations may call multiple system
+level C<write()>s even if the buffer was empty to start. There may be
+some systems where this probability is reduced to zero, and this is
+not a concern when using C<:perlio> instead of your system's STDIO.
  
  =head2 How do I randomly update a binary file?
  X<file, binary patch>
@@ -879,17 +959,15 @@ Don't forget them or you'll be quite sorry.
  =head2 How do I get a file's timestamp in perl?
  X<timestamp> X<file, timestamp>
  
-If you want to retrieve the time at which the file was last
-read, written, or had its meta-data (owner, etc) changed,
-you use the B<-A>, B<-M>, or B<-C> file test operations as
-documented in L<perlfunc>.  These retrieve the age of the
-file (measured against the start-time of your program) in
-days as a floating point number. Some platforms may not have
-all of these times.  See L<perlport> for details. To
-retrieve the "raw" time in seconds since the epoch, you
-would call the stat function, then use localtime(),
-gmtime(), or POSIX::strftime() to convert this into
-human-readable form.
+If you want to retrieve the time at which the file was last read,
+written, or had its meta-data (owner, etc) changed, you use the B<-A>,
+B<-M>, or B<-C> file test operations as documented in L<perlfunc>.
+These retrieve the age of the file (measured against the start-time of
+your program) in days as a floating point number. Some platforms may
+not have all of these times.  See L<perlport> for details. To retrieve
+the "raw" time in seconds since the epoch, you would call the stat
+function, then use C<localtime()>, C<gmtime()>, or
+C<POSIX::strftime()> to convert this into human-readable form.
  
  Here's an example:
  
@@ -955,7 +1033,7 @@ You can use the File::Slurp module to do it in one step.
         use File::Slurp;
  
         $all_of_it = read_file($filename); # entire file in scalar
-       @all_lines = read_file($filename); # one line perl element
+       @all_lines = read_file($filename); # one line per element
  
  The customary Perl approach for processing all the lines in a file is to
  do so one line at a time:
@@ -1090,7 +1168,7 @@ include also support for non-portable systems as well.
  The very first thing you should do is look into getting the Term::ReadKey
  extension from CPAN.  As we mentioned earlier, it now even has limited
  support for non-portable (read: not open systems, closed, proprietary,
-not POSIX, not Unix, etc) systems.
+not POSIX, not Unix, etc.) systems.
  
  You should also check out the Frequently Asked Questions list in
  comp.unix.* for things like this: the answer is essentially the same.
@@ -1201,9 +1279,9 @@ filehandle (perhaps you used C<POSIX::open>), you can use the
  C<close()> function from the C<POSIX> module:
  
         use POSIX ();
-       
+
         POSIX::close( $fd );
-       
+
  This should rarely be necessary, as the Perl C<close()> function is to be
  used for things that Perl opened itself, even if it was a dup of a
  numeric descriptor as with C<MHCONTEXT> above.  But if you really have
@@ -1263,7 +1341,10 @@ the permissions of the file govern whether you're allowed to.
  =head2 How do I select a random line from a file?
  X<file, selecting a random line>
  
-Here's an algorithm from the Camel Book:
+Short of loading the file into a database or pre-indexing the lines in
+the file, there are a couple of things that you can do.
+
+Here's a reservoir-sampling algorithm from the Camel Book:
  
         srand;
         rand($.) < 1 && ($line = $_) while <>;
@@ -1272,49 +1353,138 @@ This has a significant advantage in space over reading the whole file
  in.  You can find a proof of this method in I<The Art of Computer
  Programming>, Volume 2, Section 3.4.2, by Donald E. Knuth.
  
-You can use the File::Random module which provides a function
+You can use the C<File::Random> module which provides a function
  for that algorithm:
  
         use File::Random qw/random_line/;
         my $line = random_line($filename);
  
-Another way is to use the Tie::File module, which treats the entire
+Another way is to use the C<Tie::File> module, which treats the entire
  file as an array.  Simply access a random array element.
  
  =head2 Why do I get weird spaces when I print an array of lines?
  
-Saying
+(contributed by brian d foy)
+
+If you are seeing spaces between the elements of your array when
+you print the array, you are probably interpolating the array in
+double quotes:
+
+       my @animals = qw(camel llama alpaca vicuna);
+       print "animals are: @animals\n";
  
-       print "@lines\n";
+It's the double quotes, not the C<print>, doing this. Whenever you
+interpolate an array in a double quote context, Perl joins the
+elements with spaces (or whatever is in C<$">, which is a space by
+default):
  
-joins together the elements of C<@lines> with a space between them.
-If C<@lines> were C<("little", "fluffy", "clouds")> then the above
-statement would print
+       animals are: camel llama alpaca vicuna
  
-       little fluffy clouds
+This is different from printing the array without the interpolation:
  
-but if each element of C<@lines> was a line of text, ending a newline
-character C<("little\n", "fluffy\n", "clouds\n")> then it would print:
+       my @animals = qw(camel llama alpaca vicuna);
+       print "animals are: ", @animals, "\n";
  
-       little
-        fluffy
-        clouds
+Now the output doesn't have the spaces between the elements because
+the elements of C<@animals> simply become part of the list to
+C<print>:
  
-If your array contains lines, just print them:
+       animals are: camelllamaalpacavicuna
+
+You might notice this when each of the elements of C<@lines> ends with
+a newline. You expect to print one element per line, but notice that
+every line after the first is indented:
+
+       this is a line
+        this is another line
+        this is the third line
+
+That extra space comes from the interpolation of the array. If you
+don't want to put anything between your array elements, don't use the
+array in double quotes. You can send it to print without them:
  
         print @lines;
  
+=head2 How do I traverse a directory tree?
+
+(contributed by brian d foy)
+
+The C<File::Find> module, which comes with Perl, does all of the hard
+work to traverse a directory structure. You simply
+call the C<find> subroutine with a callback subroutine and the
+directories you want to traverse:
+
+       use File::Find;
+
+       find( \&wanted, @directories );
+
+       sub wanted {
+               # full path in $File::Find::name
+               # just filename in $_
+               ... do whatever you want to do ...
+               }
+
+The C<File::Find::Closures> module, which you can download from CPAN,
+provides many ready-to-use subroutines that you can use with C<File::Find>.
+
+The C<File::Finder> module, which you can download from CPAN, can help
+you create the callback subroutine using something closer to the syntax
+of the C<find> command-line utility:
+
+       use File::Find;
+       use File::Finder;
+
+       my $deep_dirs = File::Finder->depth->type('d')->ls->exec('rmdir','{}');
+
+       find( $deep_dirs->as_options, @places );
+
+The C<File::Find::Rule> module, which you can download from CPAN, has
+a similar interface, but does the traversal for you too:
+
+       use File::Find::Rule;
+
+       my @files = File::Find::Rule->file()
+                                                        ->name( '*.pm' )
+                                                        ->in( @INC );
+
+=head2 How do I delete a directory tree?
+
+(contributed by brian d foy)
+
+If you have an empty directory, you can use Perl's built-in C<rmdir>. If
+the directory is not empty (so, it has files or subdirectories), you either
+have to empty it yourself (a lot of work) or use a module to help you.
+
+The C<File::Path> module, which comes with Perl, has a C<rmtree> which
+can take care of all of the hard work for you:
+
+       use File::Path qw(rmtree);
+
+       rmtree( \@directories, 0, 0 );
+
+The first argument to C<rmtree> is either a string representing a directory path
+or an array reference. The second argument controls progress messages, and the
+third argument controls the handling of files you don't have permissions to
+delete. See the C<File::Path> module for the details.
+
+=head2 How do I copy an entire directory?
+
+(contributed by Shlomi Fish)
+
+To do the equivalent of C<cp -R> (i.e. copy an entire directory tree
+recursively) in portable Perl, you'll either need to write something yourself
+or find a good CPAN module such as L<File::Copy::Recursive>.
  =head1 REVISION
  
-Revision: $Revision: 10126 $
+Revision: $Revision$
  
-Date: $Date: 2007-10-27 21:29:20 +0200 (Sat, 27 Oct 2007) $
+Date: $Date$
  
  See L<perlfaq> for source control details and availability.
  
  =head1 AUTHOR AND COPYRIGHT
  
-Copyright (c) 1997-2007 Tom Christiansen, Nathan Torkington, and
+Copyright (c) 1997-2009 Tom Christiansen, Nathan Torkington, and
  other authors as noted. All rights reserved.
  
  This documentation is free; you can redistribute it and/or modify it