This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
improve SvPV_set's docs, it really shouldn't be public API
[perl5.git] / regcomp.c
index 56e1a25..b62c30d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -184,6 +184,7 @@ struct RExC_state_t {
     scan_frame *frame_head;
     scan_frame *frame_last;
     U32         frame_count;
+    U32         strict;
 #ifdef ADD_TO_REGEXEC
     char       *starttry;              /* -Dr: where regtry was called. */
 #define RExC_starttry  (pRExC_state->starttry)
@@ -253,6 +254,7 @@ struct RExC_state_t {
 #define RExC_frame_head (pRExC_state->frame_head)
 #define RExC_frame_last (pRExC_state->frame_last)
 #define RExC_frame_count (pRExC_state->frame_count)
+#define RExC_strict (pRExC_state->strict)
 
 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
  * a flag to disable back-off on the fixed/floating substrings - if it's
@@ -702,12 +704,6 @@ static const scan_data_t zero_scan_data =
            a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 } STMT_END
 
-
-/* Allow for side effects in s */
-#define REGC(c,s) STMT_START {                 \
-    if (!SIZE_ONLY) *(s) = (c); else (void)(s);        \
-} STMT_END
-
 /* Macros for recording node offsets.   20001227 mjd@plover.com
  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
@@ -6538,6 +6534,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     RExC_uni_semantics = 0;
     RExC_contains_locale = 0;
     RExC_contains_i = 0;
+    RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
     pRExC_state->runtime_code_qr = NULL;
     RExC_frame_head= NULL;
     RExC_frame_last= NULL;
@@ -6648,10 +6645,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     RExC_recurse_count = 0;
     pRExC_state->code_index = 0;
 
-#if 0 /* REGC() is (currently) a NOP at the first pass.
-       * Clever compilers notice this and complain. --jhi */
-    REGC((U8)REG_MAGIC, (char*)RExC_emit);
-#endif
     DEBUG_PARSE_r(
        PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
         RExC_lastnum=0;
@@ -6865,7 +6858,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     RExC_emit_bound = ri->program + RExC_size + 1;
     pRExC_state->code_index = 0;
 
-    REGC((U8)REG_MAGIC, (char*) RExC_emit++);
+    *((char*) RExC_emit++) = (char) REG_MAGIC;
     if (reg(pRExC_state, 0, &flags,1) == NULL) {
        ReREFCNT_dec(rx);
         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
@@ -11658,6 +11651,7 @@ tryagain:
                        FALSE, /* means parse the whole char class */
                        TRUE, /* allow multi-char folds */
                        FALSE, /* don't silence non-portable warnings. */
+                       RExC_strict,
                        NULL);
        if (*RExC_parse != ']') {
            RExC_parse = oregcomp_parse;
@@ -11893,6 +11887,7 @@ tryagain:
                                FALSE, /* don't silence non-portable warnings.
                                          It would be a bug if these returned
                                          non-portables */
+                               RExC_strict,
                                NULL);
                 /* regclass() can only return RESTART_UTF8 if multi-char folds
                    are allowed.  */
@@ -12267,7 +12262,7 @@ tryagain:
                                                       &result,
                                                       &error_msg,
                                                       PASS2, /* out warnings */
-                                                       FALSE, /* not strict */
+                                                       RExC_strict,
                                                        TRUE, /* Output warnings
                                                                 for non-
                                                                 portables */
@@ -12296,8 +12291,8 @@ tryagain:
                                                       &result,
                                                       &error_msg,
                                                       PASS2, /* out warnings */
-                                                       FALSE, /* not strict */
-                                                       TRUE, /* Output warnings
+                                                       RExC_strict,
+                                                       TRUE, /* Silence warnings
                                                                 for non-
                                                                 portables */
                                                        UTF);
@@ -12330,8 +12325,8 @@ tryagain:
                          * from \1 - \9 is a backreference, any multi-digit
                          * escape which does not start with 0 and which when
                          * evaluated as decimal could refer to an already
-                         * parsed capture buffer is a backslash. Anything else
-                         * is octal.
+                         * parsed capture buffer is a back reference. Anything
+                         * else is octal.
                          *
                          * Note this implies that \118 could be interpreted as
                          * 118 OR as "\11" . "8" depending on whether there
@@ -12534,8 +12529,7 @@ tryagain:
                      * unfolded, and we have to calculate how many EXACTish
                      * nodes it will take; and we may run out of room in a node
                      * in the middle of a potential multi-char fold, and have
-                     * to back off accordingly.  (Hence we can't use REGC for
-                     * the simple case just below.) */
+                     * to back off accordingly.  */
 
                     UV folded;
                     if (isASCII_uni(ender)) {
@@ -13189,7 +13183,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                                                      posix class */
                                   FALSE, /* don't allow multi-char folds */
                                   TRUE, /* silence non-portable warnings. */
-                                  &current))
+                                  TRUE, /* strict */
+                                  &current
+                                 ))
                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
                               (UV) *flagp);
 
@@ -13356,7 +13352,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                               TRUE, /* means parse just the next thing */
                               FALSE, /* don't allow multi-char folds */
                               FALSE, /* don't silence non-portable warnings.  */
-                              &current))
+                              TRUE,  /* strict */
+                              &current
+                             ))
                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
                           (UV) *flagp);
                 /* regclass() will return with parsing just the \ sequence,
@@ -13379,7 +13377,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                                                 only if not a posix class */
                              FALSE, /* don't allow multi-char folds */
                              FALSE, /* don't silence non-portable warnings.  */
-                             &current))
+                             TRUE,   /* strict */
+                             &current
+                            ))
                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
                           (UV) *flagp);
                 /* function call leaves parse pointing to the ']', except if we
@@ -13580,7 +13580,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                     TRUE, /* silence non-portable warnings.  The above may very
                              well have generated non-portable code points, but
                              they're valid on this machine */
-                    NULL);
+                    FALSE, /* similarly, no need for strict */
+                    NULL
+                );
     if (!node)
         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
                     PTR2UV(flagp));
@@ -13716,7 +13718,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  const bool silence_non_portable,   /* Don't output warnings
                                                        about too large
                                                        characters */
-                 SV** ret_invlist)  /* Return an inversion list, not a node */
+                 const bool strict,
+                 SV** ret_invlist  /* Return an inversion list, not a node */
+          )
 {
     /* parse a bracketed class specification.  Most of these will produce an
      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
@@ -13765,6 +13769,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                separate for a while from the non-complemented
                                versions because of complications with /d
                                matching */
+    SV* simple_posixes = NULL; /* But under some conditions, the classes can be
+                                  treated more simply than the general case,
+                                  leading to less compilation and execution
+                                  work */
     UV element_count = 0;   /* Number of distinct elements in the class.
                               Optimizations may be possible if this is tiny */
     AV * multi_char_matches = NULL; /* Code points that fold to more than one
@@ -13773,7 +13781,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     char * stop_ptr = RExC_end;    /* where to stop parsing */
     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
                                                    space? */
-    const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
 
     /* Unicode properties are stored in a swash; this holds the current one
      * being parsed.  If this swash is the only above-latin1 component of the
@@ -14455,15 +14462,33 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                 &cp_list);
                     }
                 }
-                else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
+                else if (UNI_SEMANTICS
+                        || classnum == _CC_ASCII
+                        || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
+                                                  || classnum == _CC_XDIGIT)))
+                {
+                    /* We usually have to worry about /d and /a affecting what
+                     * POSIX classes match, with special code needed for /d
+                     * because we won't know until runtime what all matches.
+                     * But there is no extra work needed under /u, and
+                     * [:ascii:] is unaffected by /a and /d; and :digit: and
+                     * :xdigit: don't have runtime differences under /d.  So we
+                     * can special case these, and avoid some extra work below,
+                     * and at runtime. */
+                    _invlist_union_maybe_complement_2nd(
+                                                     simple_posixes,
+                                                     PL_XPosix_ptrs[classnum],
+                                                     namedclass % 2 != 0,
+                                                     &simple_posixes);
+                }
+                else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
                            complement and use nposixes */
                     SV** posixes_ptr = namedclass % 2 == 0
                                        ? &posixes
                                        : &nposixes;
-                    SV** source_ptr = &PL_XPosix_ptrs[classnum];
                     _invlist_union_maybe_complement_2nd(
                                                      *posixes_ptr,
-                                                     *source_ptr,
+                                                     PL_XPosix_ptrs[classnum],
                                                      namedclass % 2 != 0,
                                                      posixes_ptr);
                 }
@@ -14870,24 +14895,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     op = POSIXA;
                 }
             }
-            else if (prevvalue == 'A') {
-                if (value == 'Z'
+            else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) {
+                /* We can optimize A-Z or a-z, but not if they could match
+                 * something like the KELVIN SIGN under /i (/a means they
+                 * can't) */
+                if (prevvalue == 'A') {
+                    if (value == 'Z'
 #ifdef EBCDIC
-                    && literal_endpoint == 2
+                        && literal_endpoint == 2
 #endif
-                ) {
-                    arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
-                    op = POSIXA;
+                    ) {
+                        arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+                        op = POSIXA;
+                    }
                 }
-            }
-            else if (prevvalue == 'a') {
-                if (value == 'z'
+                else if (prevvalue == 'a') {
+                    if (value == 'z'
 #ifdef EBCDIC
-                    && literal_endpoint == 2
+                        && literal_endpoint == 2
 #endif
-                ) {
-                    arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
-                    op = POSIXA;
+                    ) {
+                        arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+                        op = POSIXA;
+                    }
                 }
             }
         }
@@ -14941,6 +14971,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
             SvREFCNT_dec(posixes);
             SvREFCNT_dec(nposixes);
+            SvREFCNT_dec(simple_posixes);
             SvREFCNT_dec(cp_list);
             SvREFCNT_dec(cp_foldable_list);
             return ret;
@@ -15098,6 +15129,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * classes.  The lists are kept separate up to now because we don't want to
      * fold the classes (folding of those is automatically handled by the swash
      * fetching code) */
+    if (simple_posixes) {
+        _invlist_union(cp_list, simple_posixes, &cp_list);
+        SvREFCNT_dec_NN(simple_posixes);
+    }
     if (posixes || nposixes) {
         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
             /* Under /a and /aa, nothing above ASCII matches these */
@@ -15837,17 +15872,6 @@ S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const
 }
 
 /*
-- reguni - emit (if appropriate) a Unicode character
-*/
-PERL_STATIC_INLINE STRLEN
-S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
-{
-    PERL_ARGS_ASSERT_REGUNI;
-
-    return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
-}
-
-/*
 - reginsert - insert an operator in front of already-emitted operand
 *
 * Means relocating the operand.
@@ -16493,13 +16517,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                 sv_catpvs(sv, "{non-utf8-latin1-all}");
             }
 
-            /* output information about the unicode matching */
             if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
                 sv_catpvs(sv, "{above_bitmap_all}");
-            else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
+
+            if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
                 SV *lv; /* Set if there is something outside the bit map. */
-                bool byte_output = FALSE;   /* If something in the bitmap has
-                                               been output */
+                bool byte_output = FALSE;   /* If something has been output */
                 SV *only_utf8_locale;
 
                 /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
@@ -16628,21 +16651,22 @@ Perl_re_intuit_string(pTHX_ REGEXP * const r)
 
     DEBUG_COMPILE_r(
        {
-           const char * const s = SvPV_nolen_const(prog->check_substr
-                     ? prog->check_substr : prog->check_utf8);
+           const char * const s = SvPV_nolen_const(RX_UTF8(r)
+                     ? prog->check_utf8 : prog->check_substr);
 
            if (!PL_colorset) reginitcolors();
            PerlIO_printf(Perl_debug_log,
                      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
                      PL_colors[4],
-                     prog->check_substr ? "" : "utf8 ",
+                     RX_UTF8(r) ? "utf8 " : "",
                      PL_colors[5],PL_colors[0],
                      s,
                      PL_colors[1],
                      (strlen(s) > 60 ? "..." : ""));
        } );
 
-    return prog->check_substr ? prog->check_substr : prog->check_utf8;
+    /* use UTF8 check substring if regexp pattern itself is in UTF8 */
+    return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
 }
 
 /*