This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Revert "regcomp.c: Use a weird value in a place where ignored"
[perl5.git] / regcomp.c
index 7ffba08..9dac06c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -307,7 +307,8 @@ struct RExC_state_t {
  * Flags to be passed up and down.
  */
 #define        WORST           0       /* Worst case. */
-#define        HASWIDTH        0x01    /* Known to match non-null strings. */
+#define        HASWIDTH        0x01    /* Known to not match null strings, could match
+                                   non-null ones. */
 
 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
  * character.  (There needs to be a case: in the switch statement in regexec.c
@@ -2648,7 +2649,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
 #endif
 
     switch (flags) {
-        case EXACT: case EXACTL: break;
+        case EXACT: case EXACT_ONLY8: case EXACTL: break;
        case EXACTFAA:
         case EXACTFU_SS:
        case EXACTFU:
@@ -2663,7 +2664,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
     trie->wordcount = word_count;
     RExC_rxi->data->data[ data_slot ] = (void*)trie;
     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
-    if (flags == EXACT || flags == EXACTL)
+    if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
        trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
@@ -2737,15 +2738,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                 noper= noper_next;
         }
 
-        if ( noper < tail &&
-                (
-                    OP(noper) == flags ||
-                    (
-                        flags == EXACTFU &&
-                        OP(noper) == EXACTFU_SS
-                    )
-                )
-        ) {
+        if (    noper < tail
+            && (    OP(noper) == flags
+                || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+                || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
+                                         || OP(noper) == EXACTFU_SS))) )
+        {
             uc= (U8*)STRING(noper);
             e= uc + STR_LEN(noper);
         } else {
@@ -2958,7 +2956,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                     noper= noper_next;
             }
 
-            if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+            if (    noper < tail
+                && (    OP(noper) == flags
+                    || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+                    || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
+                                             || OP(noper) == EXACTFU_SS))) )
+            {
                 const U8 *uc= (U8*)STRING(noper);
                 const U8 *e= uc + STR_LEN(noper);
 
@@ -3178,7 +3181,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                     noper= noper_next;
             }
 
-            if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+            if (    noper < tail
+                && (    OP(noper) == flags
+                    || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+                    || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
+                                             || OP(noper) == EXACTFU_SS))) )
+            {
                 const U8 *uc= (U8*)STRING(noper);
                 const U8 *e= uc + STR_LEN(noper);
 
@@ -4011,7 +4019,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
      * this final joining, sequences could have been split over boundaries, and
      * hence missed).  The sequences only happen in folding, hence for any
      * non-EXACT EXACTish node */
-    if (OP(scan) != EXACT && OP(scan) != EXACTL) {
+    if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
         U8* s0 = (U8*) STRING(scan);
         U8* s = s0;
         U8* s_end = s0 + STR_LEN(scan);
@@ -4664,9 +4672,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                 ----------------+-----------
                                 NOTHING         | NOTHING
                                 EXACT           | EXACT
+                                EXACT_ONLY8     | EXACT
                                 EXACTFU         | EXACTFU
+                                EXACTFU_ONLY8   | EXACTFU
                                 EXACTFU_SS      | EXACTFU
-                                EXACTFAA         | EXACTFAA
+                                EXACTFAA        | EXACTFAA
                                 EXACTL          | EXACTL
                                 EXACTFLU8       | EXACTFLU8
 
@@ -4674,16 +4684,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         */
 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
                        ? NOTHING                                            \
-                       : ( EXACT == (X) )                                   \
+                       : ( EXACT == (X) || EXACT_ONLY8 == (X) )             \
                          ? EXACT                                            \
-                         : ( EXACTFU == (X) || EXACTFU_SS == (X) )          \
+                         : (     EXACTFU == (X)                             \
+                              || EXACTFU_ONLY8 == (X)                       \
+                              || EXACTFU_SS == (X) )                        \
                            ? EXACTFU                                        \
-                           : ( EXACTFAA == (X) )                             \
-                             ? EXACTFAA                                      \
+                           : ( EXACTFAA == (X) )                            \
+                             ? EXACTFAA                                     \
                              : ( EXACTL == (X) )                            \
                                ? EXACTL                                     \
-                               : ( EXACTFLU8 == (X) )                        \
-                                 ? EXACTFLU8                                 \
+                               : ( EXACTFLU8 == (X) )                       \
+                                 ? EXACTFLU8                                \
                                  : 0 )
 
                         /* dont use tail as the end marker for this traverse */
@@ -4998,7 +5010,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                continue;
            }
        }
-       else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
+       else if (   OP(scan) == EXACT
+                 || OP(scan) == EXACT_ONLY8
+                 || OP(scan) == EXACTL)
+        {
            SSize_t l = STR_LEN(scan);
            UV uc;
             assert(l);
@@ -5117,7 +5132,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            case PLUS:
                if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
                    next = NEXTOPER(scan);
-                   if (OP(next) == EXACT
+                   if (   OP(next) == EXACT
+                        || OP(next) == EXACT_ONLY8
                         || OP(next) == EXACTL
                         || (flags & SCF_DO_STCLASS))
                     {
@@ -6755,13 +6771,27 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
                && n < pRExC_state->code_blocks->count
                && s == pRExC_state->code_blocks->cb[n].start)
            {
-               /* blank out literal code block */
-               assert(pat[s] == '(');
-               while (s <= pRExC_state->code_blocks->cb[n].end) {
-                   *p++ = '_';
+               /* blank out literal code blocks so that they aren't
+                 * recompiled: eg change from/to:
+                 *     /(?{xyz})/
+                 *     /(?=====)/
+                 * and
+                 *     /(??{xyz})/
+                 *     /(?======)/
+                 * and
+                 *     /(?(?{xyz}))/
+                 *     /(?(?=====))/
+                */
+               assert(pat[s]   == '(');
+               assert(pat[s+1] == '?');
+                *p++ = '(';
+                *p++ = '?';
+                s += 2;
+               while (s < pRExC_state->code_blocks->cb[n].end) {
+                   *p++ = '=';
                    s++;
                }
-               s--;
+                *p++ = ')';
                n++;
                continue;
            }
@@ -7534,9 +7564,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         goto redo_parse;
     }
 
-    /* In a stable state, as here, this must be true */
-    assert(RExC_size = RExC_emit + 1);
-
     /* Here, we have successfully parsed and generated the pattern's program
      * for the regex engine.  We are ready to finish things up and look for
      * optimizations. */
@@ -7701,8 +7728,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         DEBUG_PEEP("first:", first, 0, 0);
         /* Ignore EXACT as we deal with it later. */
        if (PL_regkind[OP(first)] == EXACT) {
-           if (OP(first) == EXACT || OP(first) == EXACTL)
+           if (   OP(first) == EXACT
+                || OP(first) == EXACT_ONLY8
+                || OP(first) == EXACTL)
+            {
                NOOP;   /* Empty, get anchored substr later. */
+            }
            else
                RExC_rxi->regstclass = first;
        }
@@ -8044,7 +8075,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                  && nop == END)
             RExC_rx->extflags |= RXf_WHITE;
         else if ( RExC_rx->extflags & RXf_SPLIT
-                  && (fop == EXACT || fop == EXACTL)
+                  && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
                   && STR_LEN(first) == 1
                   && *(STRING(first)) == ' '
                   && nop == END )
@@ -13728,7 +13759,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              * contain only above-Latin1 characters (hence must be in UTF8),
              * which don't participate in folds with Latin1-range characters,
              * as the latter's folds aren't known until runtime. */
-            bool maybe_exactfu = TRUE;
+            bool maybe_exactfu = FOLD;
+
+            /* Does this node contain something that can't match unless the
+             * target string is (also) in UTF-8? */
+            bool requires_utf8_target = FALSE;
+
+            bool has_micro_sign = FALSE;
 
             /* Allocate an EXACT node.  The node_type may change below to
              * another EXACTish node, but since the size of the node doesn't
@@ -13852,9 +13889,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         }
                         p = RExC_parse;
                         RExC_parse = parse_start;
-                        if (ender > 0xff) {
-                            REQUIRE_UTF8(flagp);
-                        }
 
                         /* The \N{} means the pattern, if previously /d,
                          * becomes /u.  That means it can't be an EXACTF node,
@@ -13868,7 +13902,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             if (! maybe_exactfu) {
                                 len = 0;
                                 s = s0;
-                                maybe_exactfu = TRUE;   /* Prob. unnecessary */
+                                maybe_exactfu = FOLD;   /* Prob. unnecessary */
                                 goto reparse;
                             }
                         }
@@ -13916,9 +13950,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                            }
                             UPDATE_WARNINGS_LOC(p - 1);
                             ender = result;
-                           if (ender > 0xff) {
-                               REQUIRE_UTF8(flagp);
-                           }
                            break;
                        }
                    case 'x':
@@ -13952,9 +13983,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                 }
 #endif
                            }
-                            else {
-                               REQUIRE_UTF8(flagp);
-                           }
                            break;
                        }
                    case 'c':
@@ -13999,9 +14027,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                            I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                            STRLEN numlen = 3;
                            ender = grok_oct(p, &numlen, &flags, NULL);
-                           if (ender > 0xff) {
-                               REQUIRE_UTF8(flagp);
-                           }
                            p += numlen;
                             if (   isDIGIT(*p)  /* like \08, \178 */
                                 && ckWARN(WARN_REGEXP)
@@ -14082,7 +14107,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
                /* Here, have looked at the literal character, and <ender>
                  * contains its ordinal; <p> points to the character after it.
-                 * We need to check if the next non-ignored thing is a
+                 * */
+
+                if (ender > 255) {
+                    REQUIRE_UTF8(flagp);
+                }
+
+                /* We need to check if the next non-ignored thing is a
                  * quantifier.  Move <p> to after anything that should be
                  * ignored, which, as a side effect, positions <p> for the next
                  * loop iteration */
@@ -14117,6 +14148,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
                             added_len = (char *) new_s - s;
                             s = (char *) new_s;
+
+                            if (ender > 255)  {
+                                requires_utf8_target = TRUE;
+                            }
                         }
                 }
                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
@@ -14163,14 +14198,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
                     /* Here, continuing a node with non-folded characters.  Add
                      * this one */
-
-                    if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
-                        *(s++) = (char) ender;
-                    }
-                    else {
-                        s = (char *) uvchr_to_utf8((U8 *) s, ender);
-                        added_len = UVCHR_SKIP(ender);
-                    }
+                    goto not_fold_common;
                 }
                 else {  /* Here, does participate in some fold */
 
@@ -14199,6 +14227,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                                     ? FOLD_FLAGS_NOMIX_ASCII
                                                     : 0));
                             s += added_len;
+
+                            if (ender > 255)  {
+                                requires_utf8_target = TRUE;
+                                if (UNLIKELY(ender == GREEK_SMALL_LETTER_MU)) {
+                                    has_micro_sign = TRUE;
+                                }
+                            }
                         }
                     }
                     else {
@@ -14240,6 +14275,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         }
 #endif
 
+                        else if (UNLIKELY(ender == MICRO_SIGN)) {
+                            has_micro_sign = TRUE;
+                        }
+
                         /* Even when folding, we store just the input
                          * character, as we have an array that finds its fold
                          * quickly */
@@ -14432,11 +14471,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 OP(REGNODE_p(ret)) = node_type;
 
                 /* If the node type is EXACT here, check to see if it
-                 * should be EXACTL. */
+                 * should be EXACTL, or EXACT_ONLY8. */
                 if (node_type == EXACT) {
                     if (LOC) {
                         OP(REGNODE_p(ret)) = EXACTL;
                     }
+                    else if (requires_utf8_target) {
+                        OP(REGNODE_p(ret)) = EXACT_ONLY8;
+                    }
                 }
 
                 if (FOLD) {
@@ -14454,6 +14496,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     else if (node_type == EXACTF) {
                         RExC_seen_d_op = TRUE;
                     }
+
+                    /* The micro sign is the only below 256 character that
+                     * folds to above 255 */
+                    if (   OP(REGNODE_p(ret)) == EXACTFU
+                        && requires_utf8_target
+                        && LIKELY(! has_micro_sign))
+                    {
+                        OP(REGNODE_p(ret)) = EXACTFU_ONLY8;
+                    }
+
                 }
 
                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
@@ -15637,7 +15689,6 @@ redo_curchar:
                     if (UCHARAT(RExC_parse) != ')')
                         vFAIL("Expecting close paren for wrapper for nested extended charclass");
 
-                    RExC_parse++;
                     RExC_flags = save_flags;
                     goto handle_operand;
                 }
@@ -16478,8 +16529,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
     bool warn_super = ALWAYS_WARN_SUPER;
 
-    const regnode_offset orig_emit = RExC_emit; /* Save the original RExC_emit in
-        case we need to change the emitted regop to an EXACT. */
     const char * orig_parse = RExC_parse;
     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
 
@@ -18103,7 +18152,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
     if (optimizable) {
         int posix_class = -1;   /* Illegal value */
-        const char * cur_parse= RExC_parse;
         U8 ANYOFM_mask = 0xFF;
         U32 anode_arg = 0;
         UV start, end;
@@ -18369,16 +18417,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         }
 
         if (op != END) {
-            RExC_parse = (char *)orig_parse;
-            RExC_emit = orig_emit;
-
             if (regarglen[op]) {
                 ret = reganode(pRExC_state, op, anode_arg);
             } else {
                 ret = reg_node(pRExC_state, op);
             }
-
-            RExC_parse = (char *)cur_parse;
+            Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
+                                                   RExC_parse - orig_parse);;
 
             if (PL_regkind[op] == EXACT) {
                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
@@ -19224,11 +19269,13 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
         if ( exact ) {
             switch (OP(REGNODE_p(scan))) {
                 case EXACT:
+                case EXACT_ONLY8:
                 case EXACTL:
                 case EXACTF:
                 case EXACTFAA_NO_TRIE:
                 case EXACTFAA:
                 case EXACTFU:
+                case EXACTFU_ONLY8:
                 case EXACTFLU8:
                 case EXACTFU_SS:
                 case EXACTFL: