[perl #112786] Fix build under clang++

[perl5.git] / pod / perlreapi.pod
diff --git a/pod/perlreapi.pod b/pod/perlreapi.pod

index 3b5dc85..5e45620 100644 (file)
--- a/pod/perlreapi.pod
+++ b/pod/perlreapi.pod
@@ -4,9 +4,11 @@ perlreapi - perl regular expression plugin interface
  
  =head1 DESCRIPTION
  
-As of Perl 5.9.5 there is a new interface for using other regexp
-engines than the default one.  Each engine is supposed to provide
-access to a constant structure of the following format:
+As of Perl 5.9.5 there is a new interface for plugging and using other
+regular expression engines than the default one.
+
+Each engine is supposed to provide access to a constant structure of the
+following format:
  
      typedef struct regexp_engine {
          REGEXP* (*comp) (pTHX_ const SV * const pattern, const U32 flags);
@@ -34,7 +36,7 @@ access to a constant structure of the following format:
      #endif
  
  When a regexp is compiled, its C<engine> field is then set to point at
-the appropriate structure so that when it needs to be used Perl can find
+the appropriate structure, so that when it needs to be used Perl can find
  the right routines to do so.
  
  In order to install a new regexp handler, C<$^H{regcomp}> is set
@@ -61,7 +63,7 @@ the individual fields in the REGEXP struct.
  
  The C<pattern> parameter is the scalar that was used as the
  pattern. Previous versions of perl would pass two C<char*> indicating
-the start and end of the stringifed pattern, the following snippet can
+the start and end of the stringified pattern, the following snippet can
  be used to get the old parameters:
  
      STRLEN plen;
@@ -75,7 +77,7 @@ expression (C<< "ook" =~ qr/eek/ >>). perl's own engine will always
  stringify everything using the snippet above but that doesn't mean
  other engines have to.
  
-The C<flags> paramater is a bitfield which indicates which of the
+The C<flags> parameter is a bitfield which indicates which of the
  C<msixp> flags the regex was compiled with. It also contains
  additional info such as whether C<use locale> is in effect.
  
@@ -118,34 +120,27 @@ TODO: Document those cases.
  
  =item C</p> - RXf_PMf_KEEPCOPY
  
-=back
-
-Additional flags:
-
-=over 4
-
-=item RXf_SKIPWHITE
+TODO: Document this
  
-If C<split> is invoked as C<split ' '> or with no arguments (which
-really means C<split(' ', $_>, see L<split|perlfunc/split>), perl will set
-this flag and change the pattern from C<" "> to C<"\s+"> before it's
-passed to the comp routine.
+=item Character set
  
-If the flag is present in C<< rx->extflags >> C<split> to delete
-whitespace from the start of the subject string before it's operated
-on. What is considered whitespace depends on whether the subject is a
-UTF-8 string and whether the C<RXf_PMf_LOCALE> flag is set.
+The character set semantics are determined by an enum that is contained
+in this field.  This is still experimental and subject to change, but
+the current interface returns the rules by use of the in-line function
+C<get_regex_charset(const U32 flags)>.  The only currently documented
+value returned from it is REGEX_LOCALE_CHARSET, which is set if
+C<use locale> is in effect. If present in C<< rx->extflags >>,
+C<split> will use the locale dependent definition of whitespace
+when RXf_SKIPWHITE or RXf_WHITE is in effect. ASCII whitespace
+is defined as per L<isSPACE|perlapi/isSPACE>, and by the internal
+macros C<is_utf8_space> under UTF-8, and C<isSPACE_LC> under C<use
+locale>.
  
-This probably always be preserved verbatim in C<< rx->extflags >>.
+=back
  
-=item RXf_PMf_LOCALE
+Additional flags:
  
-Set if C<use locale> is in effect. If present in C<< rx->extflags >>
-C<split> will use the locale dependant definition of whitespace under
-when RXf_SKIPWHITE or RXf_WHITE are in effect. Under ASCII whitespace
-is defined as per L<isSPACE|perlapi/ISSPACE>, and by the internal
-macros C<is_utf8_space> under UTF-8 and C<isSPACE_LC> under C<use
-locale>.
+=over 4
  
  =item RXf_UTF8
  
@@ -156,6 +151,16 @@ compilation. The perl engine for instance may upgrade non-UTF-8
  strings to UTF-8 if the pattern includes constructs such as C<\x{...}>
  that can only match Unicode values.
  
+=item RXf_SPLIT
+
+If C<split> is invoked as C<split ' '> or with no arguments (which
+really means C<split(' ', $_)>, see L<split|perlfunc/split>), perl will
+set this flag. The regex engine can then check for it and set the
+SKIPWHITE and WHITE extflags. To do this the perl engine does:
+
+    if (flags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
+        r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
+
  =back
  
  These flags can be set during compilation to enable optimizations in
@@ -163,6 +168,16 @@ the C<split> operator.
  
  =over 4
  
+=item RXf_SKIPWHITE
+
+If the flag is present in C<< rx->extflags >>, C<split> will delete
+whitespace from the start of the subject string before it's operated
+on. What is considered whitespace depends on whether the subject is a
+UTF-8 string and whether the C<RXf_PMf_LOCALE> flag is set.
+
+If RXf_WHITE is set in addition to this flag C<split> will behave like
+C<split " "> under the perl engine.
+
  =item RXf_START_ONLY
  
  Tells the split operator to split the target string on newlines
@@ -180,9 +195,17 @@ without invoking the regex engine. The definition of whitespace varies
  depending on whether the target string is a UTF-8 string and on
  whether RXf_PMf_LOCALE is set.
  
-Perl's engine sets this flag if the pattern is C<\s+>, which it will be if
-the pattern actually was C<\s+> or if it was originally C<" "> (see
-C<RXf_SKIPWHITE> above).
+Perl's engine sets this flag if the pattern is C<\s+>.
+
+=item RXf_NULL
+
+Tells the split operator to split the target string on
+characters. The definition of character varies depending on whether
+the target string is a UTF-8 string.
+
+Perl's engine sets this flag on empty patterns; this optimization
+makes C<split //> much faster than it would otherwise be. It's even
+faster than C<unpack>.
  
  =back
  
@@ -227,9 +250,9 @@ perl will handle releasing anything else contained in the regexp structure.
  
  Called to get/set the value of C<$`>, C<$'>, C<$&> and their named
  equivalents, ${^PREMATCH}, ${^POSTMATCH} and ${^MATCH}, as well as the
-numbered capture buffers (C<$1>, C<$2>, ...).
+numbered capture groups (C<$1>, C<$2>, ...).
  
-The C<paren> paramater will be C<-2> for C<$`>, C<-1> for C<$'>, C<0>
+The C<paren> parameter will be C<-2> for C<$`>, C<-1> for C<$'>, C<0>
  for C<$&>, C<1> for C<$1> and so forth.
  
  The names have been chosen by analogy with L<Tie::Scalar> methods
@@ -266,12 +289,12 @@ sure this is used as the new value (or reject it).
  Example:
  
      if ("ook" =~ /(o*)/) {
-        # `paren' will be `1' and `value' will be `ee'
+        # 'paren' will be '1' and 'value' will be 'ee'
          $1 =~ tr/o/e/;
      }
  
  Perl's own engine will croak on any attempt to modify the capture
-variables, to do this in another engine use the following callack
+variables, to do this in another engine use the following callback
  (copied from C<Perl_reg_numbered_buff_store>):
  
      void
@@ -286,7 +309,7 @@ variables, to do this in another engine use the following callack
              Perl_croak(aTHX_ PL_no_modify);
      }
  
-Actually perl 5.10 will not I<always> croak in a statement that looks
+Actually perl will not I<always> croak in a statement that looks
  like it would modify a numbered capture variable. This is because the
  STORE callback will not be called if perl can determine that it
  doesn't have to modify the value. This is exactly how tied variables
@@ -301,7 +324,7 @@ behave in the same situation:
  
      package main;
  
-    tie my $sv => "CatptureVar";
+    tie my $sv => "CaptureVar";
      $sv =~ y/a/b/;
  
  Because C<$sv> is C<undef> when the C<y///> operator is applied to it
@@ -318,7 +341,7 @@ just die when assigned to in the default engine.
  Get the C<length> of a capture variable. There's a special callback
  for this so that perl doesn't have to do a FETCH and run C<length> on
  the result, since the length is (in perl's case) known from an offset
-stored in C<<rx->offs> this is much more efficient:
+stored in C<< rx->offs >>, this is much more efficient:
  
      I32 s1  = rx->offs[paren].start;
      I32 s2  = rx->offs[paren].end;
@@ -345,27 +368,27 @@ currently defined:
  Which L<Tie::Hash> operation is being performed from the Perl level on
  C<%+> or C<%->, if any:
  
-    RXf_HASH_FETCH
-    RXf_HASH_STORE
-    RXf_HASH_DELETE
-    RXf_HASH_CLEAR
-    RXf_HASH_EXISTS
-    RXf_HASH_SCALAR
-    RXf_HASH_FIRSTKEY
-    RXf_HASH_NEXTKEY
+    RXapif_FETCH
+    RXapif_STORE
+    RXapif_DELETE
+    RXapif_CLEAR
+    RXapif_EXISTS
+    RXapif_SCALAR
+    RXapif_FIRSTKEY
+    RXapif_NEXTKEY
  
  Whether C<%+> or C<%-> is being operated on, if any.
  
-    RXf_HASH_ONE /* %+ */
-    RXf_HASH_ALL /* %- */
+    RXapif_ONE /* %+ */
+    RXapif_ALL /* %- */
  
  Whether this is being called as C<re::regname>, C<re::regnames> or
  C<re::regnames_count>, if any. The first two will be combined with
-C<RXf_HASH_ONE> or C<RXf_HASH_ALL>.
+C<RXapif_ONE> or C<RXapif_ALL>.
  
-    RXf_HASH_REGNAME
-    RXf_HASH_REGNAMES
-    RXf_HASH_REGNAMES_COUNT
+    RXapif_REGNAME
+    RXapif_REGNAMES
+    RXapif_REGNAMES_COUNT
  
  Internally C<%+> and C<%-> are implemented with a real tied interface
  via L<Tie::Hash::NamedCapture>. The methods in that package will call
@@ -394,7 +417,7 @@ name for identification regardless of whether they implement methods
  on the object.
  
  The package this method returns should also have the internal
-C<Regexp> package in its C<@ISA>. C<qr//->isa("Regexp")> should always
+C<Regexp> package in its C<@ISA>. C<< qr//->isa("Regexp") >> should always
  be true regardless of what engine is being used.
  
  Example implementation might be:
@@ -426,11 +449,11 @@ Functions>.
      void* dupe(pTHX_ REGEXP * const rx, CLONE_PARAMS *param);
  
  On threaded builds a regexp may need to be duplicated so that the pattern
-can be used by mutiple threads. This routine is expected to handle the
+can be used by multiple threads. This routine is expected to handle the
  duplication of any private data pointed to by the C<pprivate> member of
  the regexp structure.  It will be called with the preconstructed new
  regexp structure as an argument, the C<pprivate> member will point at
-the B<old> private structue, and it is this routine's responsibility to
+the B<old> private structure, and it is this routine's responsibility to
  construct a copy and return a pointer to it (which perl will then use to
  overwrite the field as passed to this routine.)
  
@@ -476,7 +499,7 @@ values.
             in the final match, used for optimisations */
          struct reg_substr_data *substrs;
  
-        U32 nparens;  /* number of capture buffers */
+        U32 nparens;  /* number of capture groups */
  
          /* private engine specific data */
          U32 intflags;   /* Engine Specific Internal flags */
@@ -550,7 +573,7 @@ following pattern:
  where the C<minlen> would be 3 but C<minlenret> would only be 2 as the \d is
  required to match but is not actually included in the matched content. This
  distinction is particularly important as the substitution logic uses the
-C<minlenret> to tell whether it can do in-place substition which can result in
+C<minlenret> to tell whether it can do in-place substitution which can result in
  considerable speedup.
  
  =head2 C<gofs>
@@ -563,7 +586,7 @@ Substring data about strings that must appear in the final match. This
  is currently only used internally by perl's engine, but might be
  used in the future for all engines for optimisations.
  
-=head2 C<nparens>, C<lasparen>, and C<lastcloseparen>
+=head2 C<nparens>, C<lastparen>, and C<lastcloseparen>
  
  These fields are used to keep track of how many paren groups could be matched
  in the pattern, which was the last open paren to be entered, and which was
@@ -582,7 +605,7 @@ engine should use something else.
  
  =head2 C<swap>
  
-TODO: document
+Unused. Left in for compatibility with perl 5.10.0.
  
  =head2 C<offs>
  
@@ -596,7 +619,7 @@ C<regexp_paren_pair> struct is defined as follows:
      } regexp_paren_pair;
  
  If C<< ->offs[num].start >> or C<< ->offs[num].end >> is C<-1> then that
-capture buffer did not match. C<< ->offs[0].start/end >> represents C<$&> (or
+capture group did not match. C<< ->offs[0].start/end >> represents C<$&> (or
  C<${^MATCH}> under C<//p>) and C<< ->offs[paren].end >> matches C<$$paren> where
  C<< $paren >= 1 >>.
  
@@ -617,7 +640,7 @@ The relevant snippet from C<Perl_pp_regcomp>:
  
  =head2 C<paren_names>
  
-This is a hash used internally to track named capture buffers and their
+This is a hash used internally to track named capture groups and their
  offsets. The keys are the names of the buffers; the values are dualvars,
  with the IV slot holding the number of buffers with the given name and the
  pv being an embedded array of I32.  The values may also be contained
@@ -639,7 +662,7 @@ Used during execution phase for managing search and replace patterns.
  =head2 C<wrapped> C<wraplen>
  
  Stores the string C<qr//> stringifies to. The perl engine for example
-stores C<(?-xism:eek)> in the case of C<qr/eek/>.
+stores C<(?^:eek)> in the case of C<qr/eek/>.
  
  When using a custom engine that doesn't support the C<(?:)> construct
  for inline modifiers, it's probably best to have C<qr//> stringify to