Add release date of 5.20.1-RC1

[perl5.git] / pod / perlreguts.pod
diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod

index 75dc6dd..eac08f5 100644 (file)
--- a/pod/perlreguts.pod
+++ b/pod/perlreguts.pod
@@ -168,23 +168,29 @@ multiple of four bytes:
  
  =item C<regnode_charclass>
  
-Character classes are represented by C<regnode_charclass> structures,
-which have a four-byte argument and then a 32-byte (256-bit) bitmap
-indicating which characters are included in the class.
+Bracketed character classes are represented by C<regnode_charclass>
+structures, which have a four-byte argument and then a 32-byte (256-bit)
+bitmap indicating which characters in the Latin1 range are included in
+the class.
  
      regnode_charclass        U32 arg1;
                               char bitmap[ANYOF_BITMAP_SIZE];
  
-=item C<regnode_charclass_class>
+Various flags whose names begin with C<ANYOF_> are used for special
+situations.  Above Latin1 matches and things not known until run-time
+are stored in L</Perl's pprivate structure>.
+
+=item C<regnode_charclass_posixl>
  
  There is also a larger form of a char class structure used to represent
-POSIX char classes called C<regnode_charclass_class> which has an
-additional 4-byte (32-bit) bitmap indicating which POSIX char classes
+POSIX char classes under C</l> matching,
+called C<regnode_charclass_posixl> which has an
+additional 32-bit bitmap indicating which POSIX char classes
  have been included.
  
-   regnode_charclass_class  U32 arg1;
+   regnode_charclass_posixl U32 arg1;
                              char bitmap[ANYOF_BITMAP_SIZE];
-                            char classflags[ANYOF_CLASSBITMAP_SIZE];
+                            U32 classflags;
  
  =back
  
@@ -386,6 +392,52 @@ A grammar form might be something like this:
      piece : _piece
            | _piece quant
  
+=head3 Parsing complications
+
+The implication of the above description is that a pattern containing nested
+parentheses will result in a call graph which cycles through C<reg()>,
+C<regbranch()>, C<regpiece()>, C<regatom()>, C<reg()>, C<regbranch()> I<etc>
+multiple times, until the deepest level of nesting is reached. All the above
+routines return a pointer to a C<regnode>, which is usually the last regnode
+added to the program. However, one complication is that C<reg()> returns NULL
+for parsing C<(?:)> syntax for embedded modifiers, setting the flag
+C<TRYAGAIN>. The C<TRYAGAIN> propagates upwards until it is captured, in
+some cases by C<regatom()>, but otherwise unconditionally by
+C<regbranch()>. Hence it will never be returned by C<regbranch()> to
+C<reg()>. This flag permits patterns such as C<(?i)+> to be detected as
+errors (I<Quantifier follows nothing in regex; marked by <-- HERE in m/(?i)+
+<-- HERE />).
+
+Another complication is that the representation used for the program differs
+if it needs to store Unicode, but it's not always possible to know for sure
+whether it does until midway through parsing. The Unicode representation for
+the program is larger, and cannot be matched as efficiently. (See L</Unicode
+and Localisation Support> below for more details as to why.)  If the pattern
+contains literal Unicode, it's obvious that the program needs to store
+Unicode. Otherwise, the parser optimistically assumes that the more
+efficient representation can be used, and starts sizing on this basis.
+However, if it then encounters something in the pattern which must be stored
+as Unicode, such as an C<\x{...}> escape sequence representing a character
+literal, then this means that all previously calculated sizes need to be
+redone, using values appropriate for the Unicode representation. Currently,
+all regular expression constructions which can trigger this are parsed by code
+in C<regatom()>.
+
+To avoid wasted work when a restart is needed, the sizing pass is abandoned
+- C<regatom()> immediately returns NULL, setting the flag C<RESTART_UTF8>.
+(This action is encapsulated using the macro C<REQUIRE_UTF8>.) This restart
+request is propagated up the call chain in a similar fashion, until it is
+"caught" in C<Perl_re_op_compile()>, which marks the pattern as containing
+Unicode, and restarts the sizing pass. It is also possible for constructions
+within run-time code blocks to turn out to need Unicode representation,
+which is signalled by C<S_compile_runtime_code()> returning false to
+C<Perl_re_op_compile()>.
+
+The restart was previously implemented using a C<longjmp> in C<regatom()>
+back to a C<setjmp> in C<Perl_re_op_compile()>, but this proved to be
+problematic as the latter is a large function containing many automatic
+variables, which interact badly with the emergent control flow of C<setjmp>.
+
  =head3 Debug Output
  
  In the 5.9.x development version of perl you can C<< use re Debug => 'PARSE' >>
@@ -620,7 +672,7 @@ finding the start point in the string where we should match from,
  and the second being running the regop interpreter.
  
  If we can tell that there is no valid start point then we don't bother running
-interpreter at all. Likewise, if we know from the analysis phase that we
+the interpreter at all. Likewise, if we know from the analysis phase that we
  cannot detect a short-cut to the start position, we go straight to the
  interpreter.
  
@@ -715,40 +767,10 @@ Care must be taken when making changes to make sure that you handle
  UTF-8 properly, both at compile time and at execution time, including
  when the string and pattern are mismatched.
  
-The following comment in F<regcomp.h> gives an example of exactly how
-tricky this can be:
-
-    Two problematic code points in Unicode casefolding of EXACT nodes:
-
-    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
-    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
-
-    which casefold to
-
-    Unicode                      UTF-8
-
-    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
-    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
-
-    This means that in case-insensitive matching (or "loose matching",
-    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded
-    byte length of the above casefolded versions) can match a target
-    string of length two (the byte length of UTF-8 encoded U+0390 or
-    U+03B0). This would rather mess up the minimum length computation.
-
-    What we'll do is to look for the tail four bytes, and then peek
-    at the preceding two bytes to see whether we need to decrease
-    the minimum length by four (six minus two).
-
-    Thanks to the design of UTF-8, there cannot be false matches:
-    A sequence of valid UTF-8 bytes cannot be a subsequence of
-    another valid sequence of UTF-8 bytes.
-
-
  =head2 Base Structures
  
  The C<regexp> structure described in L<perlreapi> is common to all
-regex engines. Two of its fields that are intended for the private use
+regex engines. Two of its fields are intended for the private use
  of the regex engine that compiled the pattern. These are the
  C<intflags> and pprivate members. The C<pprivate> is a void pointer to
  an arbitrary structure whose use and management is the responsibility
@@ -765,7 +787,7 @@ the engine currently being. used and some of its fields read by perl to
  implement things such as the stringification of C<qr//>.
  
  
-The other structure is pointed to be the C<regexp> struct's
+The other structure is pointed to by the C<regexp> struct's
  C<pprivate> and is in addition to C<intflags> in the same struct
  considered to be the property of the regex engine which compiled the
  regular expression;
@@ -796,7 +818,6 @@ regex engine. Since it is specific to perl it is only of curiosity
  value to other engine implementations.
  
   typedef struct regexp_internal {
-         regexp_paren_ofs *swap; /* Swap copy of *startp / *endp */
           U32 *offsets;           /* offset annotations 20001228 MJD
                                    * data about mapping the program to
                                    * the string*/
@@ -814,15 +835,6 @@ value to other engine implementations.
  
  =over 5
  
-=item C<swap>
-
-C<swap> formerly was an extra set of startp/endp stored in a
-C<regexp_paren_ofs> struct. This was used when the last successful match
-was from the same pattern as the current pattern, so that a partial
-match didn't overwrite the previous match's results, but it caused a
-problem with re-entrant code such as trying to build the UTF-8 swashes.
-Currently unused and left for backward compatibility with 5.10.0.
-
  =item C<offsets>
  
  Offsets holds a mapping of offset in the C<program>
@@ -841,7 +853,7 @@ an independent synthetic regop that has been constructed by the optimiser.
  
  =item C<data>
  
-This field points at a reg_data structure, which is defined as follows
+This field points at a C<reg_data> structure, which is defined as follows
  
      struct reg_data {
          U32 count;