This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
5.26.2 today
[perl5.git] / regexec.c
index 79cfdf4..1425508 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -175,32 +175,6 @@ static const char* const non_utf8_target_but_utf8_required
     locinput = (p);  \
     SET_nextchr
 
-
-#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
-        if (!swash_ptr) {                                                     \
-            U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
-            swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
-                                         1, 0, invlist, &flags);              \
-            assert(swash_ptr);                                                \
-        }                                                                     \
-    } STMT_END
-
-/* If in debug mode, we test that a known character properly matches */
-#ifdef DEBUGGING
-#   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
-                                          property_name,                      \
-                                          invlist,                            \
-                                          utf8_char_in_property)              \
-        LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
-        assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
-#else
-#   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
-                                          property_name,                      \
-                                          invlist,                            \
-                                          utf8_char_in_property)              \
-        LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
-#endif
-
 #define PLACEHOLDER    /* Something for the preprocessor to grab onto */
 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 
@@ -2249,7 +2223,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
         /* UTF-8ness doesn't matter, so use 0 */
         REXEC_FBC_FIND_NEXT_SCAN(0,
-         (char *) find_next_masked((U8 *) s, (U8 *) strend, ARG(c), FLAGS(c)));
+         (char *) find_next_masked((U8 *) s, (U8 *) strend,
+                                   (U8) ARG(c), FLAGS(c)));
         break;
 
     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
@@ -4536,70 +4511,40 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
             if (c1 > 255) {
-                /* Load the folds hash, if not already done */
-                SV** listp;
-                if (! PL_utf8_foldclosures) {
-                    _load_PL_utf8_foldclosures();
+                const unsigned int * remaining_folds_to_list;
+                unsigned int first_folds_to;
+
+                /* Look up what code points (besides c1) fold to c1;  e.g.,
+                 * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
+                Size_t folds_to_count = _inverse_folds(c1,
+                                                     &first_folds_to,
+                                                     &remaining_folds_to_list);
+                if (folds_to_count == 0) {
+                    c2 = c1;    /* there is only a single character that could
+                                   match */
                 }
-
-                /* The fold closures data structure is a hash with the keys
-                 * being the UTF-8 of every character that is folded to, like
-                 * 'k', and the values each an array of all code points that
-                 * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
-                 * Multi-character folds are not included */
-                if ((! (listp = hv_fetch(PL_utf8_foldclosures,
-                                        (char *) pat,
-                                        UTF8SKIP(pat),
-                                        FALSE))))
-                {
-                    /* Not found in the hash, therefore there are no folds
-                    * containing it, so there is only a single character that
-                    * could match */
-                    c2 = c1;
+                else if (folds_to_count != 1) {
+                    /* If there aren't exactly two folds to this (itself and
+                     * another), it is outside the scope of this function */
+                    use_chrtest_void = TRUE;
                 }
-                else {  /* Does participate in folds */
-                    AV* list = (AV*) *listp;
-                    if (av_tindex_skip_len_mg(list) != 1) {
-
-                        /* If there aren't exactly two folds to this, it is
-                         * outside the scope of this function */
-                        use_chrtest_void = TRUE;
-                    }
-                    else {  /* There are two.  Get them */
-                        SV** c_p = av_fetch(list, 0, FALSE);
-                        if (c_p == NULL) {
-                            Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
-                        }
-                        c1 = SvUV(*c_p);
-
-                        c_p = av_fetch(list, 1, FALSE);
-                        if (c_p == NULL) {
-                            Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
-                        }
-                        c2 = SvUV(*c_p);
-
-                        /* Folds that cross the 255/256 boundary are forbidden
-                         * if EXACTFL (and isnt a UTF8 locale), or EXACTFAA and
-                         * one is ASCIII.  Since the pattern character is above
-                         * 255, and its only other match is below 256, the only
-                         * legal match will be to itself.  We have thrown away
-                         * the original, so have to compute which is the one
-                         * above 255. */
-                        if ((c1 < 256) != (c2 < 256)) {
-                            if ((OP(text_node) == EXACTFL
-                                 && ! IN_UTF8_CTYPE_LOCALE)
-                                || ((OP(text_node) == EXACTFAA
-                                    || OP(text_node) == EXACTFAA_NO_TRIE)
-                                    && (isASCII(c1) || isASCII(c2))))
-                            {
-                                if (c1 < 256) {
-                                    c1 = c2;
-                                }
-                                else {
-                                    c2 = c1;
-                                }
-                            }
-                        }
+                else {  /* There are two.  We already have one, get the other */
+                    c2 = first_folds_to;
+
+                    /* Folds that cross the 255/256 boundary are forbidden if
+                     * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
+                     * ASCIII.  The only other match to c1 is c2, and since c1
+                     * is above 255, c2 better be as well under these
+                     * circumstances.  If it isn't, it means the only legal
+                     * match of c1 is itself. */
+                    if (    c2 < 256
+                        && (   (   OP(text_node) == EXACTFL
+                                && ! IN_UTF8_CTYPE_LOCALE)
+                            || ((     OP(text_node) == EXACTFAA
+                                   || OP(text_node) == EXACTFAA_NO_TRIE)
+                                && (isASCII(c1) || isASCII(c2)))))
+                    {
+                        c2 = c1;
                     }
                 }
             }
@@ -5905,7 +5850,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
              */
             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
                 DEBUG_EXECUTE_r(
-                    Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
+                    Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
                               depth, PL_colors[4], PL_colors[5])
                 );
                 sayNO_SILENT;
@@ -5970,7 +5915,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
                     if (utf8_target
-                        && nextchr >= 0 /* guard against negative EOS value in nextchr */
+                        && ! NEXTCHR_IS_EOS
                         && UTF8_IS_ABOVE_LATIN1(nextchr)
                         && scan->flags == EXACTL)
                     {
@@ -5986,14 +5931,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 {
                    if (trie->states[ state ].wordnum) {
                         DEBUG_EXECUTE_r(
-                            Perl_re_exec_indentf( aTHX_  "%smatched empty string...%s\n",
+                            Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
                                           depth, PL_colors[4], PL_colors[5])
                         );
                        if (!trie->jump)
                            break;
                    } else {
                        DEBUG_EXECUTE_r(
-                            Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
+                            Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
                                           depth, PL_colors[4], PL_colors[5])
                         );
                        sayNO_SILENT;
@@ -6049,7 +5994,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
                                 /* HERE */
                                 PerlIO_printf( Perl_debug_log,
-                                    "%*s%sState: %4" UVxf " Accepted: %c ",
+                                    "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
                                     INDENT_CHARS(depth), "", PL_colors[4],
                                    (UV)state, (accepted ? 'Y' : 'N'));
                    });
@@ -6083,7 +6028,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                    }
                    DEBUG_TRIE_EXECUTE_r(
                         Perl_re_printf( aTHX_
-                           "Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
+                           "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
                            charid, uvc, (UV)state, PL_colors[5] );
                    );
                }
@@ -6102,7 +6047,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                }
 
                DEBUG_EXECUTE_r(
-                    Perl_re_exec_indentf( aTHX_  "%sgot %" IVdf " possible matches%s\n",
+                    Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
                         depth,
                        PL_colors[4], (IV)ST.accepted, PL_colors[5] );
                );
@@ -6235,7 +6180,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
                SV *sv= tmp ? sv_newmortal() : NULL;
 
-                Perl_re_exec_indentf( aTHX_  "%sonly one match left, short-circuiting: #%d <%s>%s\n",
+                Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
                     depth, PL_colors[4],
                    ST.nextword,
                    tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
@@ -7467,7 +7412,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                rei = RXi_GET(re);
                 DEBUG_EXECUTE_r(
                     debug_start_match(re_sv, utf8_target, locinput,
-                                    reginfo->strend, "Matching embedded");
+                                    reginfo->strend, "EVAL/GOSUB: Matching embedded");
                );              
                startpoint = rei->program + 1;
                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
@@ -7597,7 +7542,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            if (n > maxopenparen)
                maxopenparen = n;
             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
-               "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
+               "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
                 depth,
                PTR2UV(rex),
                PTR2UV(rex->offs),
@@ -7617,7 +7562,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
     rex->offs[n].start = rex->offs[n].start_tmp;                           \
     rex->offs[n].end = locinput - reginfo->strbeg;                         \
     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
-        "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
+        "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
         depth,                                                             \
         PTR2UV(rex),                                                       \
         PTR2UV(rex->offs),                                                 \
@@ -7858,7 +7803,7 @@ NULL
            ST.cache_mask = 0;
            
 
-            DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: matched %ld out of %d..%d\n",
+            DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
                   depth, (long)n, min, max)
            );
 
@@ -7876,7 +7821,7 @@ NULL
            /* If degenerate A matches "", assume A done. */
 
            if (locinput == cur_curlyx->u.curlyx.lastloc) {
-                DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: empty match detected, trying continuation...\n",
+                DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
                    depth)
                );
                goto do_whilem_B_max;
@@ -7944,7 +7889,7 @@ NULL
                        Newxz(aux->poscache, size, char);
                    }
                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
-      "%swhilem: Detected a super-linear match, switching on caching%s...\n",
+      "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
                              PL_colors[4], PL_colors[5])
                    );
                }
@@ -7960,7 +7905,7 @@ NULL
                    mask    = 1 << (offset % 8);
                    offset /= 8;
                    if (reginfo->info_aux->poscache[offset] & mask) {
-                        DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: (cache) already tried at this position...\n",
+                        DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
                             depth)
                        );
                         cur_curlyx->u.curlyx.count--;
@@ -8021,7 +7966,7 @@ NULL
        case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
            REGCP_UNWIND(ST.lastcp);
             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
-            DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "whilem: failed, trying continuation...\n",
+            DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
                 depth)
            );
          do_whilem_B_max:
@@ -8062,7 +8007,7 @@ NULL
                CACHEsayNO;
            }
 
-            DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "trying longer...\n", depth)
+            DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
            );
            /* Try grabbing another A and see if it helps. */
            cur_curlyx->u.curlyx.lastloc = locinput;
@@ -8693,7 +8638,7 @@ NULL
                st->u.eval.prev_eval = cur_eval;
                 cur_eval = CUR_EVAL.prev_eval;
                DEBUG_EXECUTE_r(
-                    Perl_re_exec_indentf( aTHX_  "EVAL trying tail ... (cur_eval=%p)\n",
+                    Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
                                       depth, cur_eval););
                 if ( nochange_depth )
                    nochange_depth--;
@@ -8706,7 +8651,7 @@ NULL
 
            if (locinput < reginfo->till) {
                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
-                                      "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
+                                      "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
                                      PL_colors[4],
                                      (long)(locinput - startpos),
                                      (long)(reginfo->till - startpos),
@@ -8718,7 +8663,7 @@ NULL
 
        case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
            DEBUG_EXECUTE_r(
-            Perl_re_exec_indentf( aTHX_  "%ssubpattern success...%s\n",
+            Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
                 depth, PL_colors[4], PL_colors[5]));
            sayYES;                     /* Success! */
 
@@ -8853,7 +8798,7 @@ NULL
                 sv_commit = ST.mark_name;
 
                 DEBUG_EXECUTE_r({
-                        Perl_re_exec_indentf( aTHX_  "%ssetting cutpoint to mark:%" SVf "...%s\n",
+                        Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
                             depth,
                            PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
                });