improve SvPV_set's docs, it really shouldn't be public API

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index d488267..b62c30d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -184,6 +184,7 @@ struct RExC_state_t {
      scan_frame *frame_head;
      scan_frame *frame_last;
      U32         frame_count;
+    U32         strict;
  #ifdef ADD_TO_REGEXEC
      char       *starttry;              /* -Dr: where regtry was called. */
  #define RExC_starttry  (pRExC_state->starttry)
@@ -253,6 +254,7 @@ struct RExC_state_t {
  #define RExC_frame_head (pRExC_state->frame_head)
  #define RExC_frame_last (pRExC_state->frame_last)
  #define RExC_frame_count (pRExC_state->frame_count)
+#define RExC_strict (pRExC_state->strict)
  
  /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
   * a flag to disable back-off on the fixed/floating substrings - if it's
@@ -6532,6 +6534,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      RExC_uni_semantics = 0;
      RExC_contains_locale = 0;
      RExC_contains_i = 0;
+    RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
      pRExC_state->runtime_code_qr = NULL;
      RExC_frame_head= NULL;
      RExC_frame_last= NULL;
@@ -11648,7 +11651,7 @@ tryagain:
                         FALSE, /* means parse the whole char class */
                         TRUE, /* allow multi-char folds */
                         FALSE, /* don't silence non-portable warnings. */
-                       FALSE, /* not strict */
+                       RExC_strict,
                         NULL);
         if (*RExC_parse != ']') {
             RExC_parse = oregcomp_parse;
@@ -11884,7 +11887,7 @@ tryagain:
                                 FALSE, /* don't silence non-portable warnings.
                                           It would be a bug if these returned
                                           non-portables */
-                               FALSE, /* not strict */
+                               RExC_strict,
                                 NULL);
                  /* regclass() can only return RESTART_UTF8 if multi-char folds
                     are allowed.  */
@@ -12259,7 +12262,7 @@ tryagain:
                                                        &result,
                                                        &error_msg,
                                                        PASS2, /* out warnings */
-                                                       FALSE, /* not strict */
+                                                       RExC_strict,
                                                         TRUE, /* Output warnings
                                                                  for non-
                                                                  portables */
@@ -12288,8 +12291,8 @@ tryagain:
                                                        &result,
                                                        &error_msg,
                                                        PASS2, /* out warnings */
-                                                       FALSE, /* not strict */
-                                                       TRUE, /* Output warnings
+                                                       RExC_strict,
+                                                       TRUE, /* Silence warnings
                                                                  for non-
                                                                  portables */
                                                         UTF);
@@ -12322,8 +12325,8 @@ tryagain:
                           * from \1 - \9 is a backreference, any multi-digit
                           * escape which does not start with 0 and which when
                           * evaluated as decimal could refer to an already
-                         * parsed capture buffer is a backslash. Anything else
-                         * is octal.
+                         * parsed capture buffer is a back reference. Anything
+                         * else is octal.
                           *
                           * Note this implies that \118 could be interpreted as
                           * 118 OR as "\11" . "8" depending on whether there
@@ -13766,6 +13769,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                 separate for a while from the non-complemented
                                 versions because of complications with /d
                                 matching */
+    SV* simple_posixes = NULL; /* But under some conditions, the classes can be
+                                  treated more simply than the general case,
+                                  leading to less compilation and execution
+                                  work */
      UV element_count = 0;   /* Number of distinct elements in the class.
                                Optimizations may be possible if this is tiny */
      AV * multi_char_matches = NULL; /* Code points that fold to more than one
@@ -14455,15 +14462,33 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                  &cp_list);
                      }
                  }
-                else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
+                else if (UNI_SEMANTICS
+                        || classnum == _CC_ASCII
+                        || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
+                                                  || classnum == _CC_XDIGIT)))
+                {
+                    /* We usually have to worry about /d and /a affecting what
+                     * POSIX classes match, with special code needed for /d
+                     * because we won't know until runtime everything that
+                     * matches.  But there is no extra work needed under /u;
+                     * [:ascii:] is unaffected by /a and /d; and [:digit:] and
+                     * [:xdigit:] don't have runtime differences under /d.  So
+                     * we can special case these, and avoid some extra work
+                     * below, and at runtime. */
+                    _invlist_union_maybe_complement_2nd(
+                                                     simple_posixes,
+                                                     PL_XPosix_ptrs[classnum],
+                                                     namedclass % 2 != 0,
+                                                     &simple_posixes);
+                }
+                else {  /* Garden variety class.  If it is NUPPER, NALPHA, ...
                             complement and use nposixes */
                      SV** posixes_ptr = namedclass % 2 == 0
                                         ? &posixes
                                         : &nposixes;
-                    SV** source_ptr = &PL_XPosix_ptrs[classnum];
                      _invlist_union_maybe_complement_2nd(
                                                       *posixes_ptr,
-                                                     *source_ptr,
+                                                     PL_XPosix_ptrs[classnum],
                                                       namedclass % 2 != 0,
                                                       posixes_ptr);
                  }
@@ -14870,24 +14895,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      op = POSIXA;
                  }
              }
-            else if (prevvalue == 'A') {
-                if (value == 'Z'
+            else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) {
+                /* We can optimize A-Z or a-z, but not if they could match
+                 * something like the KELVIN SIGN under /i (/a means they
+                 * can't) */
+                if (prevvalue == 'A') {
+                    if (value == 'Z'
  #ifdef EBCDIC
-                    && literal_endpoint == 2
+                        && literal_endpoint == 2
  #endif
-                ) {
-                    arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
-                    op = POSIXA;
+                    ) {
+                        arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+                        op = POSIXA;
+                    }
                  }
-            }
-            else if (prevvalue == 'a') {
-                if (value == 'z'
+                else if (prevvalue == 'a') {
+                    if (value == 'z'
  #ifdef EBCDIC
-                    && literal_endpoint == 2
+                        && literal_endpoint == 2
  #endif
-                ) {
-                    arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
-                    op = POSIXA;
+                    ) {
+                        arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+                        op = POSIXA;
+                    }
                  }
              }
          }
@@ -14941,6 +14971,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
              SvREFCNT_dec(posixes);
              SvREFCNT_dec(nposixes);
+            SvREFCNT_dec(simple_posixes);
              SvREFCNT_dec(cp_list);
              SvREFCNT_dec(cp_foldable_list);
              return ret;
@@ -15098,6 +15129,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
       * classes.  The lists are kept separate up to now because we don't want to
       * fold the classes (folding of those is automatically handled by the swash
       * fetching code) */
+    if (simple_posixes) {
+        _invlist_union(cp_list, simple_posixes, &cp_list);
+        SvREFCNT_dec_NN(simple_posixes);
+    }
      if (posixes || nposixes) {
          if (posixes && AT_LEAST_ASCII_RESTRICTED) {
              /* Under /a and /aa, nothing above ASCII matches these */
@@ -16482,13 +16517,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                  sv_catpvs(sv, "{non-utf8-latin1-all}");
              }
  
-            /* output information about the unicode matching */
              if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
                  sv_catpvs(sv, "{above_bitmap_all}");
-            else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
+
+            if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
                  SV *lv; /* Set if there is something outside the bit map. */
-                bool byte_output = FALSE;   /* If something in the bitmap has
-                                               been output */
+                bool byte_output = FALSE;   /* If something has been output */
                  SV *only_utf8_locale;
  
                  /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'