This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
improve SvPV_set's docs, it really shouldn't be public API
[perl5.git] / regcomp.c
index d488267..b62c30d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -184,6 +184,7 @@ struct RExC_state_t {
     scan_frame *frame_head;
     scan_frame *frame_last;
     U32         frame_count;
+    U32         strict;
 #ifdef ADD_TO_REGEXEC
     char       *starttry;              /* -Dr: where regtry was called. */
 #define RExC_starttry  (pRExC_state->starttry)
@@ -253,6 +254,7 @@ struct RExC_state_t {
 #define RExC_frame_head (pRExC_state->frame_head)
 #define RExC_frame_last (pRExC_state->frame_last)
 #define RExC_frame_count (pRExC_state->frame_count)
+#define RExC_strict (pRExC_state->strict)
 
 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
  * a flag to disable back-off on the fixed/floating substrings - if it's
@@ -6532,6 +6534,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     RExC_uni_semantics = 0;
     RExC_contains_locale = 0;
     RExC_contains_i = 0;
+    RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
     pRExC_state->runtime_code_qr = NULL;
     RExC_frame_head= NULL;
     RExC_frame_last= NULL;
@@ -11648,7 +11651,7 @@ tryagain:
                        FALSE, /* means parse the whole char class */
                        TRUE, /* allow multi-char folds */
                        FALSE, /* don't silence non-portable warnings. */
-                       FALSE, /* not strict */
+                       RExC_strict,
                        NULL);
        if (*RExC_parse != ']') {
            RExC_parse = oregcomp_parse;
@@ -11884,7 +11887,7 @@ tryagain:
                                FALSE, /* don't silence non-portable warnings.
                                          It would be a bug if these returned
                                          non-portables */
-                               FALSE, /* not strict */
+                               RExC_strict,
                                NULL);
                 /* regclass() can only return RESTART_UTF8 if multi-char folds
                    are allowed.  */
@@ -12259,7 +12262,7 @@ tryagain:
                                                       &result,
                                                       &error_msg,
                                                       PASS2, /* out warnings */
-                                                       FALSE, /* not strict */
+                                                       RExC_strict,
                                                        TRUE, /* Output warnings
                                                                 for non-
                                                                 portables */
@@ -12288,8 +12291,8 @@ tryagain:
                                                       &result,
                                                       &error_msg,
                                                       PASS2, /* out warnings */
-                                                       FALSE, /* not strict */
-                                                       TRUE, /* Output warnings
+                                                       RExC_strict,
+                                                       TRUE, /* Silence warnings
                                                                 for non-
                                                                 portables */
                                                        UTF);
@@ -12322,8 +12325,8 @@ tryagain:
                          * from \1 - \9 is a backreference, any multi-digit
                          * escape which does not start with 0 and which when
                          * evaluated as decimal could refer to an already
-                         * parsed capture buffer is a backslash. Anything else
-                         * is octal.
+                         * parsed capture buffer is a back reference. Anything
+                         * else is octal.
                          *
                          * Note this implies that \118 could be interpreted as
                          * 118 OR as "\11" . "8" depending on whether there
@@ -13766,6 +13769,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                separate for a while from the non-complemented
                                versions because of complications with /d
                                matching */
+    SV* simple_posixes = NULL; /* But under some conditions, the classes can be
+                                  treated more simply than the general case,
+                                  leading to less compilation and execution
+                                  work */
     UV element_count = 0;   /* Number of distinct elements in the class.
                               Optimizations may be possible if this is tiny */
     AV * multi_char_matches = NULL; /* Code points that fold to more than one
@@ -14455,15 +14462,33 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                 &cp_list);
                     }
                 }
-                else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
+                else if (UNI_SEMANTICS
+                        || classnum == _CC_ASCII
+                        || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
+                                                  || classnum == _CC_XDIGIT)))
+                {
+                    /* We usually have to worry about /d and /a affecting what
+                     * POSIX classes match, with special code needed for /d
+                     * because we won't know until runtime what all matches.
+                     * But there is no extra work needed under /u, and
+                     * [:ascii:] is unaffected by /a and /d; and :digit: and
+                     * :xdigit: don't have runtime differences under /d.  So we
+                     * can special case these, and avoid some extra work below,
+                     * and at runtime. */
+                    _invlist_union_maybe_complement_2nd(
+                                                     simple_posixes,
+                                                     PL_XPosix_ptrs[classnum],
+                                                     namedclass % 2 != 0,
+                                                     &simple_posixes);
+                }
+                else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
                            complement and use nposixes */
                     SV** posixes_ptr = namedclass % 2 == 0
                                        ? &posixes
                                        : &nposixes;
-                    SV** source_ptr = &PL_XPosix_ptrs[classnum];
                     _invlist_union_maybe_complement_2nd(
                                                      *posixes_ptr,
-                                                     *source_ptr,
+                                                     PL_XPosix_ptrs[classnum],
                                                      namedclass % 2 != 0,
                                                      posixes_ptr);
                 }
@@ -14870,24 +14895,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     op = POSIXA;
                 }
             }
-            else if (prevvalue == 'A') {
-                if (value == 'Z'
+            else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) {
+                /* We can optimize A-Z or a-z, but not if they could match
+                 * something like the KELVIN SIGN under /i (/a means they
+                 * can't) */
+                if (prevvalue == 'A') {
+                    if (value == 'Z'
 #ifdef EBCDIC
-                    && literal_endpoint == 2
+                        && literal_endpoint == 2
 #endif
-                ) {
-                    arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
-                    op = POSIXA;
+                    ) {
+                        arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+                        op = POSIXA;
+                    }
                 }
-            }
-            else if (prevvalue == 'a') {
-                if (value == 'z'
+                else if (prevvalue == 'a') {
+                    if (value == 'z'
 #ifdef EBCDIC
-                    && literal_endpoint == 2
+                        && literal_endpoint == 2
 #endif
-                ) {
-                    arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
-                    op = POSIXA;
+                    ) {
+                        arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+                        op = POSIXA;
+                    }
                 }
             }
         }
@@ -14941,6 +14971,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
             SvREFCNT_dec(posixes);
             SvREFCNT_dec(nposixes);
+            SvREFCNT_dec(simple_posixes);
             SvREFCNT_dec(cp_list);
             SvREFCNT_dec(cp_foldable_list);
             return ret;
@@ -15098,6 +15129,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * classes.  The lists are kept separate up to now because we don't want to
      * fold the classes (folding of those is automatically handled by the swash
      * fetching code) */
+    if (simple_posixes) {
+        _invlist_union(cp_list, simple_posixes, &cp_list);
+        SvREFCNT_dec_NN(simple_posixes);
+    }
     if (posixes || nposixes) {
         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
             /* Under /a and /aa, nothing above ASCII matches these */
@@ -16482,13 +16517,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                 sv_catpvs(sv, "{non-utf8-latin1-all}");
             }
 
-            /* output information about the unicode matching */
             if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
                 sv_catpvs(sv, "{above_bitmap_all}");
-            else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
+
+            if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
                 SV *lv; /* Set if there is something outside the bit map. */
-                bool byte_output = FALSE;   /* If something in the bitmap has
-                                               been output */
+                bool byte_output = FALSE;   /* If something has been output */
                 SV *only_utf8_locale;
 
                 /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'