Integrate:

author Nicholas Clark <nick@ccl4.org>

Mon, 5 Mar 2007 22:34:25 +0000 (22:34 +0000)

committer Nicholas Clark <nick@ccl4.org>

Mon, 5 Mar 2007 22:34:25 +0000 (22:34 +0000)
author Nicholas Clark <nick@ccl4.org>
Mon, 5 Mar 2007 22:34:25 +0000 (22:34 +0000)
committer Nicholas Clark <nick@ccl4.org>
Mon, 5 Mar 2007 22:34:25 +0000 (22:34 +0000)
diff --git a/MANIFEST b/MANIFEST

index 4925f4e..3a458b7 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -1025,7 +1025,7 @@ ext/XS/APItest/MANIFEST           XS::APItest extension
  ext/XS/APItest/README          XS::APItest extension
  ext/XS/APItest/t/call.t                XS::APItest extension
  ext/XS/APItest/t/exception.t   XS::APItest extension
-ext/XS/APItest/t/hash.t                XS::APItest extension
+ext/XS/APItest/t/hash.t                XS::APItest: tests for hash related APIs
  ext/XS/APItest/t/my_cxt.t      XS::APItest: test MY_CXT interface
  ext/XS/APItest/t/printf.t      XS::APItest extension
  ext/XS/APItest/t/push.t                XS::APItest extension
@@ -2472,6 +2472,7 @@ pod/perlpodspec.pod               Perl plain old documentation format specification
  pod/perlport.pod               Perl portability guide
  pod/perlref.pod                        Perl references, the rest of the story
  pod/perlreftut.pod             Perl references short introduction
+pod/perlreguts.pod             Perl regular expression engine internals
  pod/perlre.pod                 Perl regular expressions, the rest of the story
  pod/perlrequick.pod            Perl regular expressions quick start
  pod/perlreref.pod              Perl regular expressions quick reference
@@ -2490,6 +2491,7 @@ pod/perltoot.pod          Perl OO tutorial, part 1
  pod/perltrap.pod               Perl traps for the unwary
  pod/perlunicode.pod            Perl Unicode support
  pod/perluniintro.pod           Perl Unicode introduction
+pod/perlunitut.pod             Perl Unicode tutorial
  pod/perlutil.pod               utilities packaged with the Perl distribution
  pod/perlvar.pod                        Perl predefined variables
  pod/perlxs.pod                 Perl XS application programming interface
diff --git a/pod.lst b/pod.lst

index bb2ba2e..6e2ce92 100644 (file)
--- a/pod.lst
+++ b/pod.lst
@@ -78,6 +78,7 @@ h Reference Manual
    perllocale           Perl locale support
    perluniintro         Perl Unicode introduction
    perlunicode          Perl Unicode support
+  perlunitut           Perl Unicode tutorial
    perlebcdic           Considerations for running Perl on EBCDIC platforms
  
    perlsec              Perl security
@@ -105,6 +106,7 @@ h Internals and C Language Interface
    perlclib             Internal replacements for standard C library functions
    perlguts             Perl internal functions for those doing extensions
    perlcall             Perl calling conventions from C
+  perlreguts           Perl regular expression engine internals
  
    perlapi              Perl API listing (autogenerated)
    perlintern           Perl internal functions (autogenerated)
diff --git a/pod/perl.pod b/pod/perl.pod

index 08e7f28..93afbd6 100644 (file)
--- a/pod/perl.pod
+++ b/pod/perl.pod
@@ -94,6 +94,7 @@ For ease of access, the Perl manual has been split up into several sections.
      perllocale         Perl locale support
      perluniintro       Perl Unicode introduction
      perlunicode        Perl Unicode support
+    perlunitut         Perl Unicode tutorial
      perlebcdic         Considerations for running Perl on EBCDIC platforms
  
      perlsec            Perl security
@@ -121,6 +122,7 @@ For ease of access, the Perl manual has been split up into several sections.
      perlclib           Internal replacements for standard C library functions
      perlguts           Perl internal functions for those doing extensions
      perlcall           Perl calling conventions from C
+    perlreguts         Perl regular expression engine internals
  
      perlapi            Perl API listing (autogenerated)
      perlintern         Perl internal functions (autogenerated)
diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod

new file mode 100644 (file)

index 0000000..ce9f3c0
--- /dev/null
+++ b/pod/perlreguts.pod
@@ -0,0 +1,722 @@
+=head1 NAME
+
+perlreguts - Description of the Perl regular expression engine.
+
+=head1 DESCRIPTION
+
+This document is an attempt to shine some light on the guts of the regex
+engine and how it works. The regex engine represents a significant chunk
+of the perl codebase, but is relatively poorly understood. This document
+is a meagre attempt at addressing this situation. It is derived from the
+author's experience, comments in the source code, other papers on the
+regex engine, feedback in p5p, and no doubt other places as well.
+
+B<WARNING!> It should be clearly understood that this document
+represents the state of the regex engine as the author understands it at
+the time of writing. It is B<NOT> an API definition, it is purely an
+internals guide for those who want to hack the regex engine, or
+understand how the regex engine works. Readers of this document are
+expected to understand perl's regex syntax and its usage in detail; if
+you are a beginner you are in the wrong place.
+
+=head1 OVERVIEW
+
+=head2 A quick note on terms
+
+There is some debate as to whether to say 'regexp' or 'regex'. In this
+document we will use the term "regex" unless there is a special reason
+not to, and then we will explain why.
+
+When speaking about regexes we need to distinguish between their source
+code form and their internal form. In this document we will use the term
+"pattern" when we speak of their textual, source code form, the term
+"program" when we speak of their internal representation. These
+correspond to the terms C<S-regex> and C<B-regex> that Mark Jason
+Dominus employs in his paper on "Rx"[1].
+
+=head2 What is a regular expression engine?
+
+A regular expression engine is a program whose job is to efficiently
+find a section of a string that matches a set of criteria. The
+criteria are expressed in text using a formal language. See perlre for a
+full definition of the language.
+
+So the job in less grandiose terms is to somehow turn a pattern into
+something the computer can efficiently use to find the matching point in
+the string.
+
+To do this we need to produce a program by parsing the text. We then
+need to execute the program to find the point in the string that
+matches. And we need to do the whole thing efficiently.
+
+=head2 Structure of a Regexp Program
+
+=head3 High Level
+
+Although it is a bit confusing and some object to the terminology it
+is worth taking a look at a comment that has
+been in regexp.h for years:
+
+I<This is essentially a linear encoding of a nondeterministic
+finite-state machine (aka syntax charts or "railroad normal form" in
+parsing technology).>
+
+The term "railroad normal form" is a bit esoteric, with "syntax
+diagram/charts", or "railroad diagram/charts" being more common terms.
+Nevertheless it provides a useful mental image of a regex program: Each
+node can be thought of as a unit of track, with a single entry and in
+most cases a single exit point (there are pieces of track that fork, but
+statistically not many),  and the total forms a system of track with a
+single entry and single exit point. The matching process can be thought
+of as a car that moves on the track, with the particular route through
+the system being determined by the character read at each possible
+connector point. A car can roll off the track at any point but it may
+not proceed unless it matches the track...
+
+Thus the pattern C</foo(?:\w+|\d+|\s+)bar/> can be thought of as the
+following chart:
+
+                  [start]
+                     |
+                   <foo>
+                     |
+                 +---+---+
+                 |   |   |
+               <\w+> | <\s+>
+                 | <\d+> |
+                 |   |   |
+                 +---+---+
+                     |
+                   <bar>
+                     |
+                   [end]
+
+The truth of the matter is that perl's regular expressions these days are
+way beyond such models, but they can help when trying to get your
+bearings, and they do match pretty closely with the current
+implementation.
+
+To be more precise we will say that a regex program is an encoding
+of a graph.  Each node in the graph corresponds to part of
+the original regex pattern, such as a literal string or a branch,
+and has a pointer to the nodes representing the next component
+to be matched. Since "node" and opcode are overloaded terms in the
+perl source, we will call the nodes in a regex program 'regops'.
+
+The program is represented by an array of regnode structures, one or
+more of which together represent a single regop of the program. Struct
+regnode is the smallest struct needed and has a field structure which is
+shared with all the other larger structures.
+
+"Next" pointers of all regops except BRANCH implement concatenation; a
+"next" pointer with a BRANCH on both ends of it is connecting two
+alternatives.  [Here we have one of the subtle syntax dependencies:  an
+individual BRANCH (as opposed to a collection of them) is never
+concatenated with anything because of operator precedence.]
+
+The operand of some types of regop is a literal string; for others,
+it is a regop leading into a sub-program.  In particular, the operand
+of a BRANCH node is the first regop of the branch.
+
+B<NOTE>: As the railroad metaphor suggests this is B<not> a tree
+structure:  the tail of the branch connects to the thing following the
+set of BRANCHes.  It is like a single line of railway track that
+splits as it goes into a station or railway yard and rejoins as it comes
+out the other side.
+
+=head3 Regops
+
+The base structure of a regop is defined in regexp.h as follows:
+
+    struct regnode {
+        U8  flags;    /* Various purposes, sometimes overridden */
+        U8  type;     /* Opcode value as specified by regnodes.h */
+        U16 next_off; /* Offset in size regnode */
+    };
+
+Other larger regnode-like structures are defined in regcomp.h. They
+are almost like subclasses in that they have the same fields as
+regnode, with possibly additional fields following in
+the structure, and in some cases the specific meaning (and name)
+of some of the base fields are overridden. The following is a more
+complete description.
+
+=over 4
+
+=item regnode_1
+
+=item regnode_2
+
+regnode_1 structures have the same header, followed by a single
+four-byte argument; regnode_2 structures contain two two-byte
+arguments instead:
+
+    regnode_1                U32 arg1;
+    regnode_2                U16 arg1;  U16 arg2;
+
+=item regnode_string
+
+regnode_string structures, used for literal strings, follow the header
+with a one-byte length and then the string data. Strings are padded on
+the end with zero bytes so that the total length of the node is a
+multiple of four bytes:
+
+    regnode_string           char string[1];
+                             U8 str_len; (overrides flags)
+
+=item regnode_charclass
+
+character classes are represented by regnode_charclass structures,
+which have a four-byte argument and then a 32-byte (256-bit) bitmap
+indicating which characters are included in the class.
+
+    regnode_charclass        U32 arg1;
+                             char bitmap[ANYOF_BITMAP_SIZE];
+
+=item regnode_charclass_class
+
+There is also a larger form of a char class structure used to represent
+POSIX char classes called regnode_charclass_class which contains the
+same fields plus an additional 4-byte (32-bit) bitmap indicating which
+POSIX char classes have been included.
+
+    regnode_charclass_class  U32 arg1;
+                             char bitmap[ANYOF_BITMAP_SIZE];
+                             char classflags[ANYOF_CLASSBITMAP_SIZE];
+
+=back
+
+regnodes.h defines an array called regarglen[] which gives the size
+of each opcode in units of size regnode (4-byte). A macro is used
+to calculate the size of an EXACT node based on its C<str_len> field.
+
+The opcodes are defined in regnodes.h which is generated from
+regcomp.sym by regcomp.pl. Currently the maximum possible number
+of distinct opcodes is restricted to 256, with about 1/4 already
+used.
+
+There's a set of macros provided to make accessing the fields
+easier and more consistent. These include C<OP()> which is used to tell
+the type of a regnode-like structure, NEXT_OFF() which is the offset to
+the next node (more on this later), ARG(), ARG1(), ARG2(), ARG_SET(),
+and equivalents for reading and setting the arguments, STR_LEN(),
+STRING(), and OPERAND() for manipulating strings and regop bearing
+types.
+
+=head3 What opcode is next?
+
+There are three distinct concepts of "next" in the regex engine, and
+it is important to keep them clear.
+
+=over 4
+
+=item *
+
+There is the "next regnode" from a given regnode, a value which is
+rarely useful except that sometimes it matches up in terms of value
+with one of the others, and that sometimes the code assumes this to
+always be so.
+
+=item *
+
+There is the "next opcode" from a given opcode/regnode. This is the
+opcode physically located after the current one, as determined by
+the size of the current opcode. This is often useful, such as when
+dumping the structure we use this order to traverse. Sometimes the code
+assumes that the "next regnode" is the same as the "next opcode", or in
+other words assumes that the sizeof a given opcode type is always going
+to be 1 regnode large.
+
+=item *
+
+There is the "regnext" from a given opcode. This is the opcode which
+is reached by jumping forward by the value of NEXT_OFF(),
+or in a few cases for longer jumps by the arg1 field of the regnode_1
+structure. The subroutine regnext() handles this transparently.
+This is the logical successor of the node, which in some cases, like
+that of the BRANCH opcode, has special meaning.
+
+=back
+
+=head1 PROCESS OVERVIEW
+
+Broadly speaking performing a match of a string against a pattern
+involves the following steps
+
+    A. Compilation
+        1. Parsing for size
+        2. Parsing for construction
+        3. Peep-hole Optimisation and Analysis
+    B. Execution
+        4. Start position and no-match optimisations
+        5. Program execution
+
+Where these steps occur in the actual execution of a perl program is
+determined by whether the pattern involves interpolating any string
+variables. If it does then compilation happens at run time. If it
+doesn't then it happens at compile time. (The C</o> modifier changes this,
+as does C<qr//> to a certain extent.) The engine doesn't really care that
+much.
+
+=head2 Compilation
+
+This code exists primarily in regcomp.c, along with the header files
+regcomp.h, regexp.h, regnodes.h.
+
+Compilation starts with C<pregcomp()>, which is mostly an initialization
+wrapper which farms work out to two other routines for the heavy lifting. The
+first being C<reg()> which is the start point for parsing, and
+C<study_chunk()> which is responsible for optimisation.
+
+Initialization in C<pregcomp()> mostly involves the creation and data
+filling of a special structure RExC_state_t, (defined in regcomp.c).
+Almost all internally used routines in regcomp.h take a pointer to one
+of these structures as their first argument, with the name *pRExC_state.
+This structure is used to store the compilation state and contains many
+fields. Likewise there are many macros defined which operate on this
+variable. Anything that looks like RExC_xxxx is a macro that operates on
+this pointer/structure.
+
+=head3 Parsing for size
+
+In this pass the input pattern is parsed in order to calculate how much
+space is needed for each opcode we would need to emit. The size is also
+used to determine whether long jumps will be required in the program.
+
+This stage is controlled by the macro SIZE_ONLY being set.
+
+The parse proceeds pretty much exactly as it does during the
+construction phase except that most routines are shortcircuited to
+change the size field RExC_size and not actually do anything.
+
+=head3 Parsing for construction
+
+Once the size of the program has been determined the pattern is parsed
+again, but this time for real. Now SIZE_ONLY will be false, and the
+actual construction can occur.
+
+C<reg()> is the start of the parse process. It is responsible for
+parsing an arbitrary chunk of pattern up to either the end of the
+string, or the first closing parenthesis it encounters in the pattern.
+This means it can be used to parse the toplevel regex, or any section
+inside of a grouping parenthesis. It also handles the "special parens"
+that perl's regexes have. For instance when parsing C</x(?:foo)y/> C<reg()>
+will at one point be called to parse from the '?' symbol up to and
+including the ')'.
+
+Additionally C<reg()> is responsible for parsing the one or more
+branches from the pattern, and for "finishing them off" by correctly
+setting their next pointers. In order to do the parsing it repeatedly
+calls out to C<regbranch()> which is responsible for handling up to the
+first C<|> symbol it sees.
+
+C<regbranch()> in turn calls C<regpiece()> which is responsible for
+handling "things" followed by a quantifier. In order to parse the
+"things" C<regatom()> is called. This is the lowest level routine which
+is responsible for parsing out constant strings, char classes, and the
+various special symbols like C<$>. If C<regatom()> encounters a '('
+character it in turn calls C<reg()>.
+
+The routine C<regtail()> is called by both C<reg()> and C<regbranch()>
+in order to "set the tail pointer" correctly. When executing and
+we get to the end of a branch we need to go to the node following the
+grouping parens. When parsing however we don't know where the end will
+be until we get there, so when we do we must go back and update the
+offsets as appropriate. C<regtail> is used to make this easier.
+
+A subtlety of the parse process means that a regex like C</foo/> is
+originally parsed into an alternation with a single branch. It is only
+afterwards that the optimizer converts single branch alternations into the
+simpler form.
+
+=head3 Parse Call Graph and a Grammar
+
+The call graph looks like this:
+
+    reg()                        # parse a top level regex, or inside of parens
+        regbranch()              # parse a single branch of an alternation
+            regpiece()           # parse a pattern followed by a quantifier
+                regatom()        # parse a simple pattern
+                    regclass()   #   used to handle a class
+                    reg()        #   used to handle a parenthesized subpattern
+                    ....
+            ...
+            regtail()            # finish off the branch
+        ...
+        regtail()                # finish off the branch sequence. Tie each
+                                 # branches tail to the tail of the sequence
+                                 # (NEW) In Debug mode this is
+                                 # regtail_study().
+
+A grammar form might be something like this:
+
+    atom  : constant | class
+    quant : '*' | '+' | '?' | '{min,max}'
+    _branch: piece
+           | piece _branch
+           | nothing
+    branch: _branch
+          | _branch '|' branch
+    group : '(' branch ')'
+    _piece: atom | group
+    piece : _piece
+          | _piece quant
+
+=head3 Debug Output
+
+In bleadperl you can C<< use re Debug => 'PARSE'; >> to see some trace
+information about the parse process. We will start with some simple
+patterns and build up to more complex patterns.
+
+So when we parse C</foo/> we see something like the following table. The
+left shows what's being parsed, the number indicates where the next regop
+would go. The stuff on the right is the trace output of the graph. The
+names are chosen to be short to make it less dense on the screen. 'tsdy'
+is a special form of C<regtail()> which does some extra analysis.
+
+ >foo<             1            reg
+                                  brnc
+                                    piec
+                                      atom
+ ><                4              tsdy~ EXACT <foo> (EXACT) (1)
+                                      ~ attach to END (3) offset to 2
+
+The resulting program then looks like:
+
+   1: EXACT <foo>(3)
+   3: END(0)
+
+As you can see, even though we parsed out a branch and a piece, it was ultimately
+only an atom. The final program shows us how things work. We have an EXACT regop,
+followed by an END regop. The number in parens indicates where the 'regnext' of
+the node goes. The 'regnext' of an END regop is unused, as END regops mean
+we have successfully matched. The number on the left indicates the position of
+the regop in the regnode array.
+
+Now let's try a harder pattern. We will add a quantifier so we have the pattern
+C</foo+/>. We will see that C<regbranch()> calls C<regpiece()> twice.
+
+ >foo+<            1            reg
+                                  brnc
+                                    piec
+                                      atom
+ >o+<              3                piec
+                                      atom
+ ><                6                tail~ EXACT <fo> (1)
+                   7              tsdy~ EXACT <fo> (EXACT) (1)
+                                      ~ PLUS (END) (3)
+                                      ~ attach to END (6) offset to 3
+
+And we end up with the program:
+
+   1: EXACT <fo>(3)
+   3: PLUS(6)
+   4:   EXACT <o>(0)
+   6: END(0)
+
+Now we have a special case. The EXACT regop has a regnext of 0. This is
+because if it matches it should try to match itself again. The PLUS regop
+handles the actual failure of the EXACT regop and acts appropriately (going
+to regnode 6 if the EXACT matched at least once, or failing if it didn't.)
+
+Now for something much more complex: C</x(?:foo*|b[a][rR])(foo|bar)$/>
+
+ >x(?:foo*|b...    1            reg
+                                  brnc
+                                    piec
+                                      atom
+ >(?:foo*|b[...    3                piec
+                                      atom
+ >?:foo*|b[a...                         reg
+ >foo*|b[a][...                           brnc
+                                            piec
+                                              atom
+ >o*|b[a][rR...    5                        piec
+                                              atom
+ >|b[a][rR])...    8                        tail~ EXACT <fo> (3)
+ >b[a][rR])(...    9                      brnc
+                  10                        piec
+                                              atom
+ >[a][rR])(f...   12                        piec
+                                              atom
+ >a][rR])(fo...                                 clas
+ >[rR])(foo|...   14                        tail~ EXACT <b> (10)
+                                            piec
+                                              atom
+ >rR])(foo|b...                                 clas
+ >)(foo|bar)...   25                        tail~ EXACT <a> (12)
+                                          tail~ BRANCH (3)
+                  26                      tsdy~ BRANCH (END) (9)
+                                              ~ attach to TAIL (25) offset to 16
+                                          tsdy~ EXACT <fo> (EXACT) (4)
+                                              ~ STAR (END) (6)
+                                              ~ attach to TAIL (25) offset to 19
+                                          tsdy~ EXACT <b> (EXACT) (10)
+                                              ~ EXACT <a> (EXACT) (12)
+                                              ~ ANYOF[Rr] (END) (14)
+                                              ~ attach to TAIL (25) offset to 11
+ >(foo|bar)$<                       tail~ EXACT <x> (1)
+                                    piec
+                                      atom
+ >foo|bar)$<                            reg
+                  28                      brnc
+                                            piec
+                                              atom
+ >|bar)$<         31                      tail~ OPEN1 (26)
+ >bar)$<                                  brnc
+                  32                        piec
+                                              atom
+ >)$<             34                      tail~ BRANCH (28)
+                  36                      tsdy~ BRANCH (END) (31)
+                                              ~ attach to CLOSE1 (34) offset to 3
+                                          tsdy~ EXACT <foo> (EXACT) (29)
+                                              ~ attach to CLOSE1 (34) offset to 5
+                                          tsdy~ EXACT <bar> (EXACT) (32)
+                                              ~ attach to CLOSE1 (34) offset to 2
+ >$<                                tail~ BRANCH (3)
+                                        ~ BRANCH (9)
+                                        ~ TAIL (25)
+                                    piec
+                                      atom
+ ><               37                tail~ OPEN1 (26)
+                                        ~ BRANCH (28)
+                                        ~ BRANCH (31)
+                                        ~ CLOSE1 (34)
+                  38              tsdy~ EXACT <x> (EXACT) (1)
+                                      ~ BRANCH (END) (3)
+                                      ~ BRANCH (END) (9)
+                                      ~ TAIL (END) (25)
+                                      ~ OPEN1 (END) (26)
+                                      ~ BRANCH (END) (28)
+                                      ~ BRANCH (END) (31)
+                                      ~ CLOSE1 (END) (34)
+                                      ~ EOL (END) (36)
+                                      ~ attach to END (37) offset to 1
+
+Resulting in the program
+
+   1: EXACT <x>(3)
+   3: BRANCH(9)
+   4:   EXACT <fo>(6)
+   6:   STAR(26)
+   7:     EXACT <o>(0)
+   9: BRANCH(25)
+  10:   EXACT <ba>(14)
+  12:   OPTIMIZED (2 nodes)
+  14:   ANYOF[Rr](26)
+  25: TAIL(26)
+  26: OPEN1(28)
+  28:   TRIE-EXACT(34)
+        [StS:1 Wds:2 Cs:6 Uq:5 #Sts:7 Mn:3 Mx:3 Stcls:bf]
+          <foo>
+          <bar>
+  30:   OPTIMIZED (4 nodes)
+  34: CLOSE1(36)
+  36: EOL(37)
+  37: END(0)
+
+Here we can see a much more complex program, with various optimisations in
+play. At regnode 10 we can see an example where a char class with only
+one character in it was turned into an EXACT node. We can also see where
+an entire alternation was turned into a TRIE-EXACT node. As a consequence
+some of the regnodes have been marked as optimised away. We can see that
+the C<$> symbol has been converted into an EOL regop, a special piece of
+code that looks for \n or the end of a string.
+
+The next pointer for BRANCHes is interesting in that it points at where
+execution should go if the branch fails. When executing if the engine
+tries to traverse from a branch to a regnext that isn't a branch then
+the engine will know the overall series of branches have failed.
+
+=head3 Peep-hole Optimisation and Analysis
+
+The regular expression engine can be a weighty tool to wield. On long
+strings and complex patterns it can end up having to do a lot of work
+to find a match, and even more to decide that no match is possible.
+Consider a situation like the following pattern.
+
+   'ababababababababababab' =~ /(a|b)*z/
+
+The C<(a|b)*> part can match at every char in the string, and then fail
+every time because there is no C<z> in the string. So obviously we can
+not bother to use the regex engine unless there is a 'z' in the string.
+Likewise in a pattern like:
+
+   /foo(\w+)bar/
+
+In this case we know that the string must contain a C<foo> which must be
+followed by C<bar>. We can use Fast Boyer-Moore matching as implemented
+in fbm_instr() to find the location of these strings. If they don't exist
+then we don't need to resort to the much more expensive regex engine.
+Even better if they do exist then we can use their positions to
+reduce the search space that the regex engine needs to cover to determine
+if the entire pattern does match.
+
+There are various aspects of the pattern that can be used to facilitate
+optimisations along these lines:
+
+    * anchored fixed strings
+    * floating fixed strings
+    * minimum and maximum length requirements
+    * start class
+    * Beginning/End of line positions
+
+Another form of optimisation that can occur is post-parse "peep-hole"
+optimisations, where inefficient constructs are modified so that they
+are more efficient. An example of this is TAIL regops which are used
+during parsing to mark the end of branches and the end of groups. These
+regops are used as place holders during construction and "always match"
+so they can be "optimised away" by making the things that point to the
+TAIL point to the thing the TAIL points to, in essence "skipping" the node.
+
+Another optimisation that can occur is that of "EXACT merging" which is
+where two consecutive EXACT nodes are merged into a single more efficient
+to execute regop. An even more aggressive form of this is that a branch
+sequence of the form EXACT BRANCH ... EXACT can be converted into a TRIE
+regop.
+
+All of this occurs in the routine study_chunk() which uses a special
+structure scan_data_t to store the analysis that it has performed, and
+as it goes does the "peep-hole" optimisations.
+
+The code involved in study_chunk() is extremely cryptic. Be careful. :-)
+
+=head2 Execution
+
+Execution of a regex generally involves two phases, the first being
+finding the start point in the string where we should match from,
+and the second being running the regop interpreter.
+
+If we can tell that there is no valid start point we don't bother running
+the interpreter at all. Likewise if we know from the analysis phase that we
+can not optimise detection of the start position we go straight to the
+interpreter.
+
+The two entry points are re_intuit_start() and pregexec(). These routines
+have a somewhat incestuous relationship with overlap between their functions,
+and pregexec() may even call re_intuit_start() on its own. Nevertheless
+the perl source code may call into either, or both.
+
+Execution of the interpreter itself used to be recursive. Due to the
+efforts of Dave Mitchell in blead perl it no longer is. Instead an
+internal stack is maintained on the heap and the routine is fully
+iterative. This can make it tricky as the code is quite conservative
+about what state it stores which means that two consecutive lines in the
+code can actually be running in totally different contexts due to the
+simulated recursion.
+
+=head3 Start position and no-match optimisations
+
+re_intuit_start() is responsible for handling start point and no match
+optimisations as determined by the results of the analysis done by
+study_chunk() (and described in L<Peep-hole Optimisation and Analysis>).
+
+The basic structure of this routine is to try to find the start and/or
+end points of where the pattern could match, and to ensure that the string
+is long enough to match the pattern. It tries to use more efficient
+methods over less efficient methods and may involve considerable cross
+checking of constraints to find the place in the string that matches.
+For instance it may try to determine that a given fixed string must be
+not only present but a certain number of chars before the end of the
+string, or whatever.
+
+It calls out into several other routines, like fbm_instr() which does
+"Fast Boyer-Moore" matching and find_byclass() which is responsible for
+finding the start using the first mandatory regop in the program.
+
+When the optimisation criteria have been satisfied regtry() is called
+to perform the match.
+
+=head3 Program execution
+
+C<pregexec()> is the main entry point for running a regex. It contains
+support for initializing the regex interpreter's state, running
+re_intuit_start() if needed, and running the interpreter on the string
+from various start positions as needed. When it's necessary to use
+the regex interpreter C<pregexec()> calls C<regtry()>.
+
+C<regtry()> is the entry point into the regex interpreter. It expects
+as arguments a pointer to a regmatch_info structure and a pointer to
+a string.  It returns an integer 1 for success and a 0 for failure.
+It is basically a setup wrapper around C<regmatch()>.
+
+C<regmatch()> is the main "recursive loop" of the interpreter. It is
+basically a giant switch statement that executes the regops based on
+their type. A few of the regops are implemented as subroutines but
+the bulk are inline code.
+
+=head1 MISCELLANEOUS
+
+=head2 UNICODE and Localization Support
+
+No matter how you look at it unicode support is going to be a pain in a
+regex engine. Tricks that might be fine when you have 256 possible
+characters often won't scale to handle the size of the 'utf8' character
+set.  Things you can take for granted with ASCII may not be true with
+unicode. For instance in ASCII it's safe to assume that
+C<sizeof(char1) == sizeof(char2)>, in utf8 it isn't. Unicode case folding is
+vastly more complex than the simple rules of English, and even when not
+using unicode but only localized single byte encodings things can get
+tricky (technically GERMAN-SHARP-ESS should match 'ss' in localized case
+insensitive matching.)
+
+Making things worse is that C<utf8> support was a later addition to the
+regex engine (as it was to perl) and necessarily this made things a lot
+more complicated. Obviously it's easier to design a regex engine with
+unicode support from the beginning than it is to retrofit one that
+wasn't designed with it in mind.
+
+Pretty well every regop that involves looking at the input string has
+two cases, one for 'utf8' and one not. In fact it's often more complex
+than that, as the pattern may be 'utf8' as well.
+
+Care must be taken when making changes to make sure that you handle
+utf8 properly both at compile time and at execution time, including
+when the string and pattern are mismatched.
+
+The following comment in regcomp.h gives an example of exactly how
+tricky this can be:
+
+    Two problematic code points in Unicode casefolding of EXACT nodes:
+
+    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+
+    which casefold to
+
+    Unicode                      UTF-8
+
+    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
+    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
+
+    This means that in case-insensitive matching (or "loose matching",
+    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded
+    byte length of the above casefolded versions) can match a target
+    string of length two (the byte length of UTF-8 encoded U+0390 or
+    U+03B0). This would rather mess up the minimum length computation.
+
+    What we'll do is to look for the tail four bytes, and then peek
+    at the preceding two bytes to see whether we need to decrease
+    the minimum length by four (six minus two).
+
+    Thanks to the design of UTF-8, there cannot be false matches:
+    A sequence of valid UTF-8 bytes cannot be a subsequence of
+    another valid sequence of UTF-8 bytes.
+
+=head1 AUTHOR
+
+by Yves Orton, 2006.
+
+With excerpts from Perl, and contributions and suggestions from
+Ronald J. Kimball, Dave Mitchell, Dominic Dunlop, Mark Jason Dominus,
+and Stephen McCamant.
+
+=head1 LICENSE
+
+Same terms as Perl.
+
+=head1 REFERENCES
+
+[1] http://perl.plover.com/Rx/paper/
+
+=cut
diff --git a/pod/perltoc.pod b/pod/perltoc.pod

index f82a7c3..679ae32 100644 (file)
--- a/pod/perltoc.pod
+++ b/pod/perltoc.pod
@@ -3845,6 +3845,34 @@ classes, Use of "Is" Prefix, Blocks
  
  =back
  
+=head2 perlunitut - Perl Unicode Tutorial
+
+=over 4
+
+=item DESCRIPTION
+
+=over 4
+
+=item Definitions
+
+=item Your new toolkit
+
+=item I/O flow (the actual 5 minute tutorial)
+
+=item Q and A
+
+=back
+
+=item SUMMARY
+
+=item ACKNOWLEDGEMENTS
+
+=item AUTHOR
+
+=item SEE ALSO
+
+=back
+
  =head2 perlebcdic - Considerations for running Perl on EBCDIC platforms
  
  =over 4
@@ -5260,6 +5288,53 @@ callback
  
  =back
  
+=head2 perlreguts - Description of the Perl regular expression engine.
+
+=over 4
+
+=item DESCRIPTION
+
+=item OVERVIEW
+
+=over 4
+
+=item A quick note on terms
+
+=item What is a regular expression engine?
+
+=item Structure of a Regexp Program
+
+regnode_1, regnode_2, regnode_string, regnode_charclass,
+regnode_charclass_class
+
+=back
+
+=item PROCESS OVERVIEW
+
+=over 4
+
+=item Compilation
+
+=item Execution
+
+=back
+
+=item MISCELLANEOUS
+
+=over 4
+
+=item UNICODE and Localization Support
+
+=back
+
+=item AUTHOR
+
+=item LICENSE
+
+=item REFERENCES
+
+=back
+
  =head2 perlapi - autogenerated documentation for the perl public API
  
  =over 4
@@ -5380,7 +5455,8 @@ PoisonNew X<PoisonNew>, PoisonWith X<PoisonWith>, Renew X<Renew>, Renewc
  X<Renewc>, Safefree X<Safefree>, savepv X<savepv>, savepvn X<savepvn>,
  savepvs X<savepvs>, savesharedpv X<savesharedpv>, savesharedpvn
  X<savesharedpvn>, savesvpv X<savesvpv>, StructCopy X<StructCopy>, Zero
-X<Zero>, ZeroD X<ZeroD>
+savepvs X<savepvs>, savesharedpv X<savesharedpv>, savesvpv X<savesvpv>,
+StructCopy X<StructCopy>, Zero X<Zero>, ZeroD X<ZeroD>
  
  =item Miscellaneous Functions
  
@@ -10130,6 +10206,16 @@ threads->is_detached()
  
  =back
  
+=item THREAD CONTEXT
+
+=over 4
+
+=item Explicit context
+
+=item Implicit context
+
+=back
+
  =item THREAD STACK SIZE
  
  threads->get_stack_size();, $size = $thr->get_stack_size();, $old_size =
@@ -11227,7 +11313,7 @@ redoop, nextop, lastop
  
  =item B::COP Methods
  
-label, stash, stashpv, file, cop_seq, arybase, line, warnings, io
+label, stash, stashpv, file, cop_seq, arybase, line, warnings, io, hints
  
  =back
  
@@ -11425,10 +11511,11 @@ B<~>
  
  B<#>I<var>, B<#>I<var>I<N>, B<#>I<Var>, B<#addr>, B<#arg>, B<#class>,
  B<#classsym>, B<#coplabel>, B<#exname>, B<#extarg>, B<#firstaddr>,
-B<#flags>, B<#flagval>, B<#hyphseq>, B<#label>, B<#lastaddr>, B<#name>,
-B<#NAME>, B<#next>, B<#nextaddr>, B<#noise>, B<#private>, B<#privval>,
-B<#seq>, B<#seqnum>, B<#opt>, B<#static>, B<#sibaddr>, B<#svaddr>,
-B<#svclass>, B<#svval>, B<#targ>, B<#targarg>, B<#targarglife>, B<#typenum>
+B<#flags>, B<#flagval>, B<#hints>, B<#hintsval>, B<#hyphseq>, B<#label>,
+B<#lastaddr>, B<#name>, B<#NAME>, B<#next>, B<#nextaddr>, B<#noise>,
+B<#private>, B<#privval>, B<#seq>, B<#seqnum>, B<#opt>, B<#static>,
+B<#sibaddr>, B<#svaddr>, B<#svclass>, B<#svval>, B<#targ>, B<#targarg>,
+B<#targarglife>, B<#typenum>
  
  =back
  
@@ -11536,8 +11623,8 @@ strict, $[, bytes, utf8, integer, re, warnings, hint_bits, warning_bits
  
  =item OPTIONS AND LINT CHECKS
  
-B<context>, B<implicit-read> and B<implicit-write>, B<bare-subs>,
-B<dollar-underscore>, B<private-names>, B<undefined-subs>,
+B<magic-diamond>, B<context>, B<implicit-read> and B<implicit-write>,
+B<bare-subs>, B<dollar-underscore>, B<private-names>, B<undefined-subs>,
  B<regexp-variables>, B<all>, B<none>
  
  =item NON LINT-CHECK OPTIONS
@@ -11546,6 +11633,11 @@ B<-u Package>
  
  =item EXTENDING LINT
  
+=item TODO
+
+while(<FH>) stomps $_, strict oo, unchecked system calls, more tests,
+validate against older perls
+
  =item BUGS
  
  =item AUTHOR
@@ -12517,32 +12609,36 @@ C<d_ctermid_r>, C<d_ctime_r>, C<d_cuserid>, C<d_dbl_dig>,
  C<d_dbminitproto>, C<d_difftime>, C<d_dirfd>, C<d_dirnamlen>, C<d_dlerror>,
  C<d_dlopen>, C<d_dlsymun>, C<d_dosuid>, C<d_drand48_r>, C<d_drand48proto>,
  C<d_dup2>, C<d_eaccess>, C<d_endgrent>, C<d_endgrent_r>, C<d_endhent>,
-C<d_endhostent_r>, C<d_endnent>, C<d_endnetent_r>, C<d_endpent>,
-C<d_endprotoent_r>, C<d_endpwent>, C<d_endpwent_r>, C<d_endsent>,
-C<d_endservent_r>, C<d_eofnblk>, C<d_eunice>, C<d_faststdio>, C<d_fchdir>,
-C<d_fchmod>, C<d_fchown>, C<d_fcntl>, C<d_fcntl_can_lock>, C<d_fd_macros>,
-C<d_fd_set>, C<d_fds_bits>, C<d_fgetpos>, C<d_finite>, C<d_finitel>,
-C<d_flexfnam>, C<d_flock>, C<d_flockproto>, C<d_fork>, C<d_fp_class>,
-C<d_fpathconf>, C<d_fpclass>, C<d_fpclassify>, C<d_fpclassl>,
-C<d_fpos64_t>, C<d_frexpl>, C<d_fs_data_s>, C<d_fseeko>, C<d_fsetpos>,
-C<d_fstatfs>, C<d_fstatvfs>, C<d_fsync>, C<d_ftello>, C<d_ftime>,
-C<d_futimes>, C<d_Gconvert>, C<d_getcwd>, C<d_getespwnam>, C<d_getfsstat>,
-C<d_getgrent>, C<d_getgrent_r>, C<d_getgrgid_r>, C<d_getgrnam_r>,
-C<d_getgrps>, C<d_gethbyaddr>, C<d_gethbyname>, C<d_gethent>,
-C<d_gethname>, C<d_gethostbyaddr_r>, C<d_gethostbyname_r>,
-C<d_gethostent_r>, C<d_gethostprotos>, C<d_getitimer>, C<d_getlogin>,
-C<d_getlogin_r>, C<d_getmnt>, C<d_getmntent>, C<d_getnbyaddr>,
-C<d_getnbyname>, C<d_getnent>, C<d_getnetbyaddr_r>, C<d_getnetbyname_r>,
-C<d_getnetent_r>, C<d_getnetprotos>, C<d_getpagsz>, C<d_getpbyname>,
-C<d_getpbynumber>, C<d_getpent>, C<d_getpgid>, C<d_getpgrp>, C<d_getpgrp2>,
-C<d_getppid>, C<d_getprior>, C<d_getprotobyname_r>,
-C<d_getprotobynumber_r>, C<d_getprotoent_r>, C<d_getprotoprotos>,
-C<d_getprpwnam>, C<d_getpwent>, C<d_getpwent_r>, C<d_getpwnam_r>,
-C<d_getpwuid_r>, C<d_getsbyname>, C<d_getsbyport>, C<d_getsent>,
-C<d_getservbyname_r>, C<d_getservbyport_r>, C<d_getservent_r>,
-C<d_getservprotos>, C<d_getspnam>, C<d_getspnam_r>, C<d_gettimeod>,
-C<d_gmtime_r>, C<d_gnulibc>, C<d_grpasswd>, C<d_hasmntopt>, C<d_htonl>,
-C<d_ilogbl>, C<d_inc_version_list>, C<d_index>, C<d_inetaton>,
+C<d_copysignl>, C<d_crypt>, C<d_crypt_r>, C<d_csh>, C<d_ctermid_r>,
+C<d_ctime_r>, C<d_cuserid>, C<d_dbl_dig>, C<d_dbminitproto>, C<d_difftime>,
+C<d_dirfd>, C<d_dirnamlen>, C<d_dlerror>, C<d_dlopen>, C<d_dlsymun>,
+C<d_dosuid>, C<d_drand48_r>, C<d_drand48proto>, C<d_dup2>, C<d_eaccess>,
+C<d_endgrent>, C<d_endgrent_r>, C<d_endhent>, C<d_endhostent_r>,
+C<d_endnent>, C<d_endnetent_r>, C<d_endpent>, C<d_endprotoent_r>,
+C<d_endpwent>, C<d_endpwent_r>, C<d_endsent>, C<d_endservent_r>,
+C<d_eofnblk>, C<d_eunice>, C<d_faststdio>, C<d_fchdir>, C<d_fchmod>,
+C<d_fchown>, C<d_fcntl>, C<d_fcntl_can_lock>, C<d_fd_macros>, C<d_fd_set>,
+C<d_fds_bits>, C<d_fgetpos>, C<d_finite>, C<d_finitel>, C<d_flexfnam>,
+C<d_flock>, C<d_flockproto>, C<d_fork>, C<d_fp_class>, C<d_fpathconf>,
+C<d_fpclass>, C<d_fpclassify>, C<d_fpclassl>, C<d_fpos64_t>, C<d_frexpl>,
+C<d_fs_data_s>, C<d_fseeko>, C<d_fsetpos>, C<d_fstatfs>, C<d_fstatvfs>,
+C<d_fsync>, C<d_ftello>, C<d_ftime>, C<d_futimes>, C<d_Gconvert>,
+C<d_getcwd>, C<d_getespwnam>, C<d_getfsstat>, C<d_getgrent>,
+C<d_getgrent_r>, C<d_getgrgid_r>, C<d_getgrnam_r>, C<d_getgrps>,
+C<d_gethbyaddr>, C<d_gethbyname>, C<d_gethent>, C<d_gethname>,
+C<d_gethostbyaddr_r>, C<d_gethostbyname_r>, C<d_gethostent_r>,
+C<d_gethostprotos>, C<d_getitimer>, C<d_getlogin>, C<d_getlogin_r>,
+C<d_getmnt>, C<d_getmntent>, C<d_getnbyaddr>, C<d_getnbyname>,
+C<d_getnent>, C<d_getnetbyaddr_r>, C<d_getnetbyname_r>, C<d_getnetent_r>,
+C<d_getnetprotos>, C<d_getpagsz>, C<d_getpbyname>, C<d_getpbynumber>,
+C<d_getpent>, C<d_getpgid>, C<d_getpgrp>, C<d_getpgrp2>, C<d_getppid>,
+C<d_getprior>, C<d_getprotobyname_r>, C<d_getprotobynumber_r>,
+C<d_getprotoent_r>, C<d_getprotoprotos>, C<d_getprpwnam>, C<d_getpwent>,
+C<d_getpwent_r>, C<d_getpwnam_r>, C<d_getpwuid_r>, C<d_getsbyname>,
+C<d_getsbyport>, C<d_getsent>, C<d_getservbyname_r>, C<d_getservbyport_r>,
+C<d_getservent_r>, C<d_getservprotos>, C<d_getspnam>, C<d_getspnam_r>,
+C<d_gettimeod>, C<d_gmtime_r>, C<d_gnulibc>, C<d_grpasswd>, C<d_hasmntopt>,
+C<d_htonl>, C<d_ilogbl>, C<d_inc_version_list>, C<d_index>, C<d_inetaton>,
  C<d_int64_t>, C<d_isascii>, C<d_isfinite>, C<d_isinf>, C<d_isnan>,
  C<d_isnanl>, C<d_killpg>, C<d_lchown>, C<d_ldbl_dig>,
  C<d_libm_lib_version>, C<d_link>, C<d_localtime_r>,
@@ -17542,6 +17638,10 @@ handling
  
  =item DESCRIPTION
  
+=item See Also
+
+L<IPC::Open2>, L<IPC::Run>
+
  =item WARNING
  
  =back
diff --git a/pod/perlunitut.pod b/pod/perlunitut.pod

new file mode 100644 (file)

index 0000000..ae8d0b1
--- /dev/null
+++ b/pod/perlunitut.pod
@@ -0,0 +1,425 @@
+=head1 NAME
+
+perlunitut - Perl Unicode Tutorial
+
+=head1 DESCRIPTION
+
+The days of just flinging strings around are over. It's well established that
+modern programs need to be capable of communicating funny accented letters, and
+things like euro symbols. This means that programmers need new habits. It's
+easy to program Unicode capable software, but it does require discipline to do
+it right.
+
+There's a lot to know about character sets, and text encodings. It's probably
+best to spend a full day learning all this, but the basics can be learned in
+minutes. 
+
+These are not the very basics, though. It is assumed that you already
+know the difference between bytes and characters, and realise (and accept!)
+that there are many different character sets and encodings, and that your
+program has to be explicit about them. Recommended reading is "The Absolute
+Minimum Every Software Developer Absolutely, Positively Must Know About Unicode
+and Character Sets (No Excuses!)" by Joel Spolsky, at
+L<http://joelonsoftware.com/articles/Unicode.html>.
+
+This tutorial speaks in rather absolute terms, and provides only a limited view
+of the wealth of character string related features that Perl has to offer. For
+most projects, this information will probably suffice.
+
+=head2 Definitions
+
+It's important to set a few things straight first. This is the most important
+part of this tutorial. This view may conflict with other information that you
+may have found on the web, but that's mostly because many sources are wrong.
+
+You may have to re-read this entire section a few times...
+
+=head3 Unicode
+
+B<Unicode> is a character set with room for lots of characters. The ordinal
+value of a character is called a B<code point>. 
+
+There are many, many code points, but computers work with bytes, and a byte can
+have only 256 values. Unicode has many more characters, so you need a method
+to make these accessible.
+
+Unicode is encoded using several competing encodings, of which UTF-8 is the
+most used. In a Unicode encoding, multiple subsequent bytes can be used to
+store a single code point, or simply: character.
+
+=head3 UTF-8
+
+B<UTF-8> is a Unicode encoding. Many people think that Unicode and UTF-8 are
+the same thing, but they're not. There are more Unicode encodings, but much of
+the world has standardized on UTF-8. 
+
+UTF-8 treats the first 128 code points, 0..127, the same as ASCII. They take
+only one byte per character. All other characters are encoded as two or more
+(up to six) bytes using a complex scheme. Fortunately, Perl handles this for
+us, so we don't have to worry about this.
+
+=head3 Text strings (character strings)
+
+B<Text strings>, or B<character strings> are made of characters. Bytes are
+irrelevant here, and so are encodings. Each character is just that: the
+character.
+
+On a text string, you would do things like:
+
+    $text =~ s/foo/bar/;
+    if ($string =~ /^\d+$/) { ... }
+    $text = ucfirst $text;
+    my $character_count = length $text;
+
+The value of a character (C<ord>, C<chr>) is the corresponding Unicode code
+point.
+
+=head3 Binary strings (byte strings)
+
+B<Binary strings>, or B<byte strings> are made of bytes. Here, you don't have
+characters, just bytes. All communication with the outside world (anything
+outside of your current Perl process) is done in binary.
+
+On a binary string, you would do things like:
+
+    my (@length_content) = unpack "(V/a)*", $binary;
+    $binary =~ s/\x00\x0F/\xFF\xF0/;  # for the brave :)
+    print {$fh} $binary;
+    my $byte_count = length $binary;
+
+=head3 Encoding
+
+B<Encoding> (as a verb) is the conversion from I<text> to I<binary>. To encode,
+you have to supply the target encoding, for example C<iso-8859-1> or C<UTF-8>.
+Some encodings, like the C<iso-8859> ("latin") range, do not support the full
+Unicode standard; characters that can't be represented are lost in the
+conversion.
+
+=head3 Decoding
+
+B<Decoding> is the conversion from I<binary> to I<text>. To decode, you have to
+know what encoding was used during the encoding phase. And most of all, it must
+be something decodable. It doesn't make much sense to decode a PNG image into a
+text string.
+
+=head3 Internal format
+
+Perl has an B<internal format>, an encoding that it uses to encode text strings
+so it can store them in memory. All text strings are in this internal format.
+In fact, text strings are never in any other format!
+
+You shouldn't worry about what this format is, because conversion is
+automatically done when you decode or encode.
+
+=head2 Your new toolkit
+
+Add to your standard heading the following line:
+
+    use Encode qw(encode decode);
+
+Or, if you're lazy, just:
+
+    use Encode;
+
+=head2 I/O flow (the actual 5 minute tutorial)
+
+The typical input/output flow of a program is:
+
+    1. Receive and decode
+    2. Process
+    3. Encode and output
+
+If your input is binary, and is supposed to remain binary, you shouldn't decode
+it to a text string, of course. But in all other cases, you should decode it.
+
+Decoding can't happen reliably if you don't know how the data was encoded. If
+you get to choose, it's a good idea to standardize on UTF-8.
+
+    my $foo   = decode('UTF-8', get 'http://example.com/');
+    my $bar   = decode('ISO-8859-1', readline STDIN);
+    my $xyzzy = decode('Windows-1251', $cgi->param('foo'));
+
+Processing happens as you knew before. The only difference is that you're now
+using characters instead of bytes. That's very useful if you use things like
+C<substr>, or C<length>.
+
+It's important to realize that there are no bytes in a text string. Of course,
+Perl has its internal encoding to store the string in memory, but ignore that.
+If you have to do anything with the number of bytes, it's probably best to move
+that part to step 3, just after you've encoded the string. Then you know
+exactly how many bytes it will be in the destination string.
+
+The syntax for encoding text strings to binary strings is as simple as decoding:
+
+    $body = encode('UTF-8', $body);
+
+If you needed to know the length of the string in bytes, now's the perfect time
+for that. Because C<$body> is now a byte string, C<length> will report the
+number of bytes, instead of the number of characters. The number of
+characters is no longer known, because characters only exist in text strings.
+
+    my $byte_count = length $body;
+
+And if the protocol you're using supports a way of letting the recipient know
+which character encoding you used, please help the receiving end by using that
+feature! For example, E-mail and HTTP support MIME headers, so you can use the
+C<Content-Type> header. They can also have C<Content-Length> to indicate the
+number of I<bytes>, which is always a good idea to supply if the number is
+known.
+
+    "Content-Type: text/plain; charset=UTF-8",
+    "Content-Length: $byte_count"
+
+=head2 Q and A
+
+=head3 This isn't really a Unicode tutorial, is it?
+
+No, Perl has an abstracted interface for all supported character encodings, so
+this is actually a generic C<Encode> tutorial. But many people think that
+Unicode is special and magical, and I didn't want to disappoint them, so I
+decided to call this document a Unicode tutorial.
+
+=head3 What about binary data, like images?
+
+Well, apart from a bare C<binmode $fh>, you shouldn't treat them specially.
+(The binmode is needed because otherwise Perl may convert line endings on Win32
+systems.)
+
+Be careful, though, to never combine text strings with binary strings. If you
+need text in a binary stream, encode your text strings first using the
+appropriate encoding, then join them with binary strings. See also: "What if I
+don't encode?".
+
+=head3 What about the UTF-8 flag?
+
+Please, unless you're hacking the internals, or debugging weirdness, don't
+think about the UTF-8 flag at all. That means that you very probably shouldn't
+use C<is_utf8>, C<_utf8_on> or C<_utf8_off> at all.
+
+Perl's internal format happens to be UTF-8. Unfortunately, Perl can't keep a
+secret, so everyone knows about this.  That is the source of much confusion.
+It's better to pretend that the internal format is some unknown encoding,
+and that you always have to encode and decode explicitly.
+
+=head3 When should I decode or encode?
+
+Whenever you're communicating with anything that is external to your perl
+process, like a database, a text file, a socket, or another program. Even if
+the thing you're communicating with is also written in Perl.
+
+=head3 What if I don't decode?
+
+Whenever your encoded, binary string is used together with a text string, Perl
+will assume that your binary string was encoded with ISO-8859-1, also known as
+latin-1. If it wasn't latin-1, then your data is unpleasantly converted. For
+example, if it was UTF-8, the individual bytes of multibyte characters are seen
+as separate characters, and then again converted to UTF-8. Such double encoding
+can be compared to double HTML encoding (C<&amp;gt;>), or double URI encoding
+(C<%253E>).
+
+This silent implicit decoding is known as "upgrading". That may sound
+positive, but it's best to avoid it.
+
+=head3 What if I don't encode?
+
+Your text string will be sent using the bytes in Perl's internal format. In
+some cases, Perl will warn you that you're doing something wrong, with a
+friendly warning:
+
+    Wide character in print at example.pl line 2.
+
+Because the internal format is often UTF-8, these bugs are hard to spot,
+because UTF-8 is usually the encoding you wanted! But don't be lazy, and don't
+use the fact that Perl's internal format is UTF-8 to your advantage. Encode
+explicitly to avoid weird bugs, and to show to maintenance programmers that you
+thought this through.
+
+=head3 Is there a way to automatically decode or encode?
+
+If all data that comes from a certain handle is encoded in exactly the same
+way, you can tell the PerlIO system to automatically decode everything, with
+the C<encoding> layer. If you do this, you can't accidentally forget to decode
+or encode anymore, on things that use the layered handle.
+
+You can provide this layer when C<open>ing the file:
+
+    open my $fh, '>:encoding(UTF-8)', $filename;  # auto encoding on write
+    open my $fh, '<:encoding(UTF-8)', $filename;  # auto decoding on read
+
+Or if you already have an open filehandle:
+
+    binmode $fh, ':encoding(UTF-8)';
+
+Some database drivers for DBI can also automatically encode and decode, but
+that is typically limited to the UTF-8 encoding, because they cheat.
+
+=head3 Cheat?! Tell me, how can I cheat?
+
+Well, because Perl's internal format is UTF-8, you can just skip the encoding
+or decoding step, and manipulate the UTF-8 flag directly.
+
+Instead of C<:encoding(UTF-8)>, you can simply use C<:utf8>. This is widely
+accepted as good behavior.
+
+Instead of C<decode> and C<encode>, you could use C<_utf8_on> and C<_utf8_off>.
+But this is, contrary to C<:utf8>, considered bad style.
+
+There are some shortcuts for oneliners; see C<-C> in L<perlrun>.
+
+=head3 What if I don't know which encoding was used?
+
+Do whatever you can to find out, and if you have to: guess. (Don't forget to
+document your guess with a comment.)
+
+You could open the document in a web browser, and change the character set or
+character encoding until you can visually confirm that all characters look the
+way they should.
+
+There is no way to reliably detect the encoding automatically, so if people
+keep sending you data without charset indication, you may have to educate them.
+
+=head3 Can I use Unicode in my Perl sources?
+
+Yes, you can! If your sources are UTF-8 encoded, you can indicate that with the
+C<use utf8> pragma.
+
+    use utf8;
+
+This doesn't do anything to your input, or to your output. It only influences
+the way your sources are read. You can use Unicode in string literals, in
+identifiers (but they still have to be "word characters" according to C<\w>),
+and even in custom delimiters.
+
+=head3 Data::Dumper doesn't restore the UTF-8 flag; is it broken?
+
+No, Data::Dumper's Unicode abilities are as they should be. There have been
+some complaints that it should restore the UTF-8 flag when the data is read
+again with C<eval>. However, you should really not look at the flag, and
+nothing indicates that Data::Dumper should break this rule.
+
+Here's what happens: when Perl reads in a string literal, it sticks to 8 bit
+encoding as long as it can. (But perhaps originally it was internally encoded
+as UTF-8, when you dumped it.) When it has to give that up because other
+characters are added to the text string, it silently upgrades the string to
+UTF-8. 
+
+If you properly encode your strings for output, none of this is of your
+concern, and you can just C<eval> dumped data as always.
+
+=head3 How can I determine if a string is a text string or a binary string?
+
+You can't. Some use the UTF-8 flag for this, but that's misuse, and makes well
+behaved modules like Data::Dumper look bad. The flag is useless for this
+purpose, because it's off when an 8 bit encoding (by default ISO-8859-1) is
+used to store the string.
+
+This is something you, the programmer, have to keep track of; sorry. You could
+consider adopting a kind of "Hungarian notation" to help with this.
+
+=head3 How do I convert from encoding FOO to encoding BAR?
+
+By first converting the FOO-encoded byte string to a text string, and then the
+text string to a BAR-encoded byte string:
+
+    my $text_string = decode('FOO', $foo_string);
+    my $bar_string  = encode('BAR', $text_string);
+
+or by skipping the text string part, and going directly from one binary
+encoding to the other:
+
+    use Encode qw(from_to);
+    from_to($string, 'FOO', 'BAR');  # changes contents of $string
+
+or by letting automatic decoding and encoding do all the work:
+
+    open my $foofh, '<:encoding(FOO)', 'example.foo.txt';
+    open my $barfh, '>:encoding(BAR)', 'example.bar.txt';
+    print { $barfh } $_ while <$foofh>;
+
+=head3 What about the C<use bytes> pragma?
+
+Don't use it. It makes no sense to deal with bytes in a text string, and it
+makes no sense to deal with characters in a byte string. Do the proper
+conversions (by decoding/encoding), and things will work out well: you get
+character counts for decoded data, and byte counts for encoded data.
+
+C<use bytes> is usually a failed attempt to do something useful. Just forget
+about it.
+
+=head3 What are C<decode_utf8> and C<encode_utf8>?
+
+These are alternate syntaxes for C<decode('utf8', ...)> and C<encode('utf8',
+...)>.
+
+=head3 What's the difference between C<UTF-8> and C<utf8>?
+
+C<UTF-8> is the official standard. C<utf8> is Perl's way of being liberal in
+what it accepts. If you have to communicate with things that aren't so liberal,
+you may want to consider using C<UTF-8>. If you have to communicate with things
+that are too liberal, you may have to use C<utf8>. The full explanation is in
+L<Encode>.
+
+C<UTF-8> is internally known as C<utf-8-strict>. This tutorial uses UTF-8
+consistently, even where utf8 is actually used internally, because the
+distinction can be hard to make, and is mostly irrelevant.
+
+Okay, if you insist: the "internal format" is utf8, not UTF-8. (When it's not
+some other encoding.)
+
+=head3 I lost track; what encoding is the internal format really?
+
+It's good that you lost track, because you shouldn't depend on the internal
+format being any specific encoding. But since you asked: by default, the
+internal format is either ISO-8859-1 (latin-1), or utf8, depending on the
+history of the string.
+
+Perl knows how it stored the string internally, and will use that knowledge
+when you C<encode>. In other words: don't try to find out what the internal
+encoding for a certain string is, but instead just encode it into the encoding
+that you want.
+
+=head3 What character encodings does Perl support?
+
+To find out which character encodings your Perl supports, run:
+
+    perl -MEncode -le "print for Encode->encodings(':all')"
+
+=head3 Which version of perl should I use?
+
+Well, if you can, upgrade to the most recent, but certainly C<5.8.1> or newer.
+This tutorial is based on the status quo as of C<5.8.7>.
+
+You should also check your modules, and upgrade them if necessary. For example,
+HTML::Entities requires version >= 1.32 to function correctly, even though the
+changelog is silent about this.
+
+=head1 SUMMARY
+
+Decode everything you receive, encode everything you send out. (If it's text
+data.)
+
+=head1 ACKNOWLEDGEMENTS
+
+Thanks to Johan Vromans from Squirrel Consultancy. His UTF-8 rants during the
+Amsterdam Perl Mongers meetings got me interested and determined to find out
+how to use character encodings in Perl in ways that don't break easily.
+
+Thanks to Gerard Goossen from TTY. His presentation "UTF-8 in the wild" (Dutch
+Perl Workshop 2006) inspired me to publish my thoughts and write this tutorial.
+
+Thanks to the people who asked about this kind of stuff in several Perl IRC
+channels, and have constantly reminded me that a simpler explanation was
+needed.
+
+Thanks to the people who reviewed this document for me, before it went public.
+They are: Benjamin Smith, Jan-Pieter Cornet, Johan Vromans, Lukas Mai, Nathan
+Gray.
+
+=head1 AUTHOR
+
+Juerd Waalboer <juerd@cpan.org>
+
+=head1 SEE ALSO
+
+L<perlunicode>, L<perluniintro>, L<Encode>
+
diff --git a/vms/descrip_mms.template b/vms/descrip_mms.template

index b5839af..1a52f27 100644 (file)
--- a/vms/descrip_mms.template
+++ b/vms/descrip_mms.template
@@ -402,12 +402,12 @@ pod16 = [.lib.pods]perlmodinstall.pod [.lib.pods]perlmodlib.pod [.lib.pods]perlm
  pod17 = [.lib.pods]perlnewmod.pod [.lib.pods]perlnumber.pod [.lib.pods]perlobj.pod [.lib.pods]perlop.pod [.lib.pods]perlopenbsd.pod
  pod18 = [.lib.pods]perlopentut.pod [.lib.pods]perlos2.pod [.lib.pods]perlos390.pod [.lib.pods]perlos400.pod [.lib.pods]perlothrtut.pod
  pod19 = [.lib.pods]perlpacktut.pod [.lib.pods]perlplan9.pod [.lib.pods]perlpod.pod [.lib.pods]perlpodspec.pod [.lib.pods]perlport.pod [.lib.pods]perlqnx.pod
-pod20 = [.lib.pods]perlre.pod [.lib.pods]perlref.pod [.lib.pods]perlreftut.pod [.lib.pods]perlrequick.pod [.lib.pods]perlreref.pod [.lib.pods]perlretut.pod
-pod21 = [.lib.pods]perlriscos.pod [.lib.pods]perlrun.pod [.lib.pods]perlsec.pod [.lib.pods]perlsolaris.pod [.lib.pods]perlstyle.pod [.lib.pods]perlsub.pod
-pod22 = [.lib.pods]perlsyn.pod [.lib.pods]perlthrtut.pod [.lib.pods]perltie.pod [.lib.pods]perltoc.pod [.lib.pods]perltodo.pod [.lib.pods]perltooc.pod
-pod23 = [.lib.pods]perltoot.pod [.lib.pods]perltrap.pod [.lib.pods]perltru64.pod [.lib.pods]perltw.pod [.lib.pods]perlunicode.pod [.lib.pods]perluniintro.pod
-pod24 = [.lib.pods]perlutil.pod [.lib.pods]perluts.pod [.lib.pods]perlvar.pod [.lib.pods]perlvmesa.pod [.lib.pods]perlvms.pod [.lib.pods]perlvos.pod
-pod25 = [.lib.pods]perlwin32.pod [.lib.pods]perlxs.pod [.lib.pods]perlxstut.pod
+pod20 = [.lib.pods]perlre.pod [.lib.pods]perlref.pod [.lib.pods]perlreftut.pod [.lib.pods]perlreguts.pod [.lib.pods]perlrequick.pod [.lib.pods]perlreref.pod
+pod21 = [.lib.pods]perlretut.pod [.lib.pods]perlriscos.pod [.lib.pods]perlrun.pod [.lib.pods]perlsec.pod [.lib.pods]perlsolaris.pod [.lib.pods]perlstyle.pod
+pod22 = [.lib.pods]perlsub.pod [.lib.pods]perlsyn.pod [.lib.pods]perlthrtut.pod [.lib.pods]perltie.pod [.lib.pods]perltoc.pod [.lib.pods]perltodo.pod
+pod23 = [.lib.pods]perltooc.pod [.lib.pods]perltoot.pod [.lib.pods]perltrap.pod [.lib.pods]perltru64.pod [.lib.pods]perltw.pod [.lib.pods]perlunicode.pod
+pod24 = [.lib.pods]perluniintro.pod [.lib.pods]perlunitut.pod [.lib.pods]perlutil.pod [.lib.pods]perluts.pod [.lib.pods]perlvar.pod [.lib.pods]perlvmesa.pod
+pod25 = [.lib.pods]perlvms.pod [.lib.pods]perlvos.pod [.lib.pods]perlwin32.pod [.lib.pods]perlxs.pod [.lib.pods]perlxstut.pod
  pod = $(pod0) $(pod1) $(pod2) $(pod3) $(pod4) $(pod5) $(pod6) $(pod7) $(pod8) $(pod9) $(pod10) $(pod11) $(pod12) $(pod13) $(pod14) $(pod15) $(pod16) $(pod17) $(pod18) $(pod19) $(pod20) $(pod21) $(pod22) $(pod23) $(pod24) $(pod25)
  
  # Would be useful to automate the generation of this rule from pod/buildtoc
@@ -1129,6 +1129,10 @@ makeppport : $(MINIPERL_EXE) $(ARCHDIR)Config.pm
         @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods]
         Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods]
  
+[.lib.pods]perlreguts.pod : [.pod]perlreguts.pod
+       @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods]
+       Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods]
+
  [.lib.pods]perlrequick.pod : [.pod]perlrequick.pod
         @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods]
         Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods]
@@ -1213,6 +1217,10 @@ makeppport : $(MINIPERL_EXE) $(ARCHDIR)Config.pm
         @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods]
         Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods]
  
+[.lib.pods]perlunitut.pod : [.pod]perlunitut.pod
+       @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods]
+       Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods]
+
  [.lib.pods]perlutil.pod : [.pod]perlutil.pod
         @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods]
         Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods]
diff --git a/win32/pod.mak b/win32/pod.mak

index 2beac50..d6a6386 100644 (file)
--- a/win32/pod.mak
+++ b/win32/pod.mak
@@ -98,6 +98,7 @@ POD = \
         perlre.pod      \
         perlref.pod     \
         perlreftut.pod  \
+       perlreguts.pod  \
         perlrequick.pod \
         perlreref.pod   \
         perlretut.pod   \
@@ -115,6 +116,7 @@ POD = \
         perltrap.pod    \
         perlunicode.pod \
         perluniintro.pod        \
+       perlunitut.pod  \
         perlutil.pod    \
         perlvar.pod     \
         perlxs.pod      \
@@ -204,6 +206,7 @@ MAN = \
         perlre.man      \
         perlref.man     \
         perlreftut.man  \
+       perlreguts.man  \
         perlrequick.man \
         perlreref.man   \
         perlretut.man   \
@@ -221,6 +224,7 @@ MAN = \
         perltrap.man    \
         perlunicode.man \
         perluniintro.man        \
+       perlunitut.man  \
         perlutil.man    \
         perlvar.man     \
         perlxs.man      \
@@ -310,6 +314,7 @@ HTML = \
         perlre.html     \
         perlref.html    \
         perlreftut.html \
+       perlreguts.html \
         perlrequick.html        \
         perlreref.html  \
         perlretut.html  \
@@ -326,6 +331,7 @@ HTML = \
         perltrap.html   \
         perlunicode.html        \
         perluniintro.html       \
+       perlunitut.html \
         perlutil.html   \
         perlvar.html    \
         perlxs.html     \
@@ -416,6 +422,7 @@ TEX = \
         perlre.tex      \
         perlref.tex     \
         perlreftut.tex  \
+       perlreguts.tex  \
         perlrequick.tex \
         perlreref.tex   \
         perlretut.tex   \
@@ -433,6 +440,7 @@ TEX = \
         perltrap.tex    \
         perlunicode.tex \
         perluniintro.tex        \
+       perlunitut.tex  \
         perlutil.tex    \
         perlvar.tex     \
         perlxs.tex      \
author	Nicholas Clark <nick@ccl4.org>
	Mon, 5 Mar 2007 22:34:25 +0000 (22:34 +0000)
committer	Nicholas Clark <nick@ccl4.org>
	Mon, 5 Mar 2007 22:34:25 +0000 (22:34 +0000)
MANIFEST		patch \| blob \| blame \| history
pod.lst		patch \| blob \| blame \| history
pod/perl.pod		patch \| blob \| blame \| history
pod/perlreguts.pod	[new file with mode: 0644]	patch \| blob
pod/perltoc.pod		patch \| blob \| blame \| history
pod/perlunitut.pod	[new file with mode: 0644]	patch \| blob
vms/descrip_mms.template		patch \| blob \| blame \| history
win32/pod.mak		patch \| blob \| blame \| history