X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/86897aa65fb10f5652fed7562f006ae8a470c661..a482c76f48270e80f77784f6a08d244af9b26d7e:/regcomp.c diff --git a/regcomp.c b/regcomp.c index d992be7..d2f95c2 100644 --- a/regcomp.c +++ b/regcomp.c @@ -159,7 +159,8 @@ struct RExC_state_t { within pattern */ int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ -#if ADD_TO_REGEXEC + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif @@ -196,6 +197,7 @@ struct RExC_state_t { #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) #define RExC_npar (pRExC_state->npar) #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) @@ -741,38 +743,41 @@ static const scan_data_t zero_scan_data = DEBUG_OPTIMISE_MORE_r({ \ PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ \ - if (RExC_seen & REG_SEEN_ZERO_LEN) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN "); \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ \ - if (RExC_seen & REG_SEEN_LOOKBEHIND) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND "); \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ \ - if (RExC_seen & REG_SEEN_GPOS) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS "); \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ \ - if (RExC_seen & REG_SEEN_CANY) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY "); \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ \ - if (RExC_seen & REG_SEEN_RECURSE) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE "); \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ \ - if (RExC_seen & REG_TOP_LEVEL_BRANCHES) \ - PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES "); \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ \ - if (RExC_seen & REG_SEEN_VERBARG) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG "); \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ \ - if (RExC_seen & REG_SEEN_CUTGROUP) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP "); \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ \ - if (RExC_seen & REG_SEEN_RUN_ON_COMMENT) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT "); \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ \ - if (RExC_seen & REG_SEEN_UNFOLDED_MULTI) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_UNFOLDED_MULTI "); \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ \ - if (RExC_seen & REG_SEEN_GOSTART) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART "); \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ \ PerlIO_printf(Perl_debug_log,"\n"); \ }); @@ -919,7 +924,7 @@ S_ssc_is_anything(pTHX_ const regnode_ssc *ssc) } /* If e.g., both \w and \W are set, matches everything */ - if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) { + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { int i; for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { @@ -953,7 +958,6 @@ S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) * necessary. */ if (RExC_contains_locale) { ANYOF_POSIXL_SETALL(ssc); - ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL; } else { ANYOF_POSIXL_ZERO(ssc); @@ -986,11 +990,7 @@ S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, return FALSE; } - if (RExC_contains_locale - && ! ((ANYOF_FLAGS(ssc) & ANYOF_LOCALE) - || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) - || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))) - { + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { return FALSE; } @@ -999,7 +999,7 @@ S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, - const regnode_charclass_posixl_fold* const node) + const regnode_charclass* const node) { /* Returns a mortal inversion list defining which code points are matched * by 'node', which is of type ANYOF. Handles complementing the result if @@ -1008,6 +1008,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, * possibility. */ SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; unsigned int i; const U32 n = ARG(node); bool new_node_has_latin1 = FALSE; @@ -1030,11 +1031,18 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, * known until runtime -- we have to assume it could be anything */ return _add_range_to_invlist(invlist, 0, UV_MAX); } - else { + else if (ary[3] && ary[3] != &PL_sv_undef) { /* Here no compile-time swash, and no run-time only data. Use the * node's inversion list */ - invlist = sv_2mortal(invlist_clone(ary[2])); + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; } } @@ -1082,11 +1090,12 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, _invlist_union(invlist, PL_Latin1, &invlist); } - /* Similarly add the UTF-8 locale possible matches */ - if (ANYOF_FLAGS(node) & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(node)) - { + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { _invlist_union_maybe_complement_2nd(invlist, - ANYOF_UTF8_LOCALE_INVLIST(node), + only_utf8_locale_invlist, ANYOF_FLAGS(node) & ANYOF_INVERT, &invlist); } @@ -1100,30 +1109,13 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) -STATIC void -S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with) -{ - /* Take the flags 'and_with' and accumulate them anded into the flags for - * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored. - * The flags 'and_with' should not come from another SSC (otherwise the - * EMPTY_STRING flag won't work) */ - - const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS; - - PERL_ARGS_ASSERT_SSC_FLAGS_AND; - - /* Use just the SSC-related flags from 'and_with' */ - ANYOF_FLAGS(ssc) &= (and_with & ANYOF_COMMON_FLAGS); - ANYOF_FLAGS(ssc) |= ssc_only_flags; -} - /* 'AND' a given class with another one. Can create false positives. 'ssc' * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ STATIC void S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, - const regnode_ssc *and_with) + const regnode_charclass *and_with) { /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either * another SSC or a regular ANYOF class. Can create false positives. */ @@ -1138,7 +1130,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract * the code point inversion list and just the relevant flags */ if (is_ANYOF_SYNTHETIC(and_with)) { - anded_cp_list = and_with->invlist; + anded_cp_list = ((regnode_ssc *)and_with)->invlist; anded_flags = ANYOF_FLAGS(and_with); /* XXX This is a kludge around what appears to be deficiencies in the @@ -1156,13 +1148,12 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * creates bugs, the consequences are only that a warning isn't raised * that should be; while the consequences for having /l bugs is * incorrect matches */ - if (ssc_is_anything(and_with)) { + if (ssc_is_anything((regnode_ssc *)and_with)) { anded_flags |= ANYOF_WARN_SUPER; } } else { - anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, - (regnode_charclass_posixl_fold*) and_with); + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; } @@ -1214,7 +1205,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { ANYOF_POSIXL_ZERO(ssc); } - else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) { + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { /* Note that the Posix class component P from 'and_with' actually * looks like: @@ -1239,16 +1230,16 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * standard, in particular almost everything by Microsoft. * The loop below just changes e.g., \w into \W and vice versa */ - regnode_charclass_posixl_fold temp; + regnode_charclass_posixl temp; int add = 1; /* To calculate the index of the complement */ ANYOF_POSIXL_ZERO(&temp); for (i = 0; i < ANYOF_MAX; i++) { assert(i % 2 != 0 - || ! ANYOF_POSIXL_TEST(and_with, i) - || ! ANYOF_POSIXL_TEST(and_with, i + 1)); + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); - if (ANYOF_POSIXL_TEST(and_with, i)) { + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { ANYOF_POSIXL_SET(&temp, i + add); } add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ @@ -1259,7 +1250,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC in its initial state */ else if (! is_ANYOF_SYNTHETIC(and_with) - || ! ssc_is_cp_posixl_init(pRExC_state, and_with)) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) { /* But if 'ssc' is in its initial state, the result is just 'and_with'; * copy it over 'ssc' */ @@ -1271,15 +1262,17 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, ssc->invlist = anded_cp_list; ANYOF_POSIXL_ZERO(ssc); if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { - ANYOF_POSIXL_OR(and_with, ssc); + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); } } } - else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL) - || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { /* One or the other of P1, P2 is non-empty. */ - ANYOF_POSIXL_AND(and_with, ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } ssc_union(ssc, anded_cp_list, FALSE); } else { /* P1 = P2 = empty */ @@ -1290,7 +1283,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, STATIC void S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, - const regnode_ssc *or_with) + const regnode_charclass *or_with) { /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either * another SSC or a regular ANYOF class. Can create false positives if @@ -1306,12 +1299,11 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract * the code point inversion list and just the relevant flags */ if (is_ANYOF_SYNTHETIC(or_with)) { - ored_cp_list = or_with->invlist; + ored_cp_list = ((regnode_ssc*) or_with)->invlist; ored_flags = ANYOF_FLAGS(or_with); } else { - ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, - (regnode_charclass_posixl_fold*) or_with); + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; } @@ -1339,10 +1331,10 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, && ! is_ANYOF_SYNTHETIC(or_with)) { /* We ignore P2, leaving P1 going forward */ - } - else { /* Not inverted */ - ANYOF_POSIXL_OR(or_with, ssc); - if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) { + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { unsigned int i; for (i = 0; i < ANYOF_MAX; i += 2) { if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) @@ -1350,9 +1342,6 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, ssc_match_all_cp(ssc); ANYOF_POSIXL_CLEAR(ssc, i); ANYOF_POSIXL_CLEAR(ssc, i+1); - if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) { - ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL; - } } } } @@ -1453,13 +1442,17 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); - set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE); + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); - /* The code points that could match under /li are already incorporated into - * the inversion list and bit map */ - ANYOF_FLAGS(ssc) &= ~ANYOF_LOC_FOLD; + /* Make sure is clone-safe */ + ssc->invlist = NULL; - assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; + } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); } #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] @@ -2061,10 +2054,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); const U8 *e = uc + STR_LEN( noper ); - STRLEN foldlen = 0; + int foldlen = 0; U32 wordlen = 0; /* required init */ - STRLEN minbytes = 0; - STRLEN maxbytes = 0; + STRLEN minchars = 0; + STRLEN maxchars = 0; bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ @@ -2089,58 +2082,74 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); } } - for ( ; uc < e ; uc += len ) { + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; - /* Acummulate to the current values, the range in the number of - * bytes that this character could match. The max is presumed to - * be the same as the folded input (which TRIE_READ_CHAR returns), - * except that when this is not in UTF-8, it could be matched - * against a string which is UTF-8, and the variant characters - * could be 2 bytes instead of the 1 here. Likewise, for the - * minimum number of bytes when not folded. When folding, the min - * is assumed to be 1 byte could fold to match the single character - * here, or in the case of a multi-char fold, 1 byte can fold to - * the whole sequence. 'foldlen' is used to denote whether we are - * in such a sequence, skipping the min setting if so. XXX TODO - * Use the exact list of what folds to each character, from - * PL_utf8_foldclosures */ - if (UTF) { - maxbytes += UTF8SKIP(uc); - if (! folder) { - /* A non-UTF-8 string could be 1 byte to match our 2 */ - minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc)) - ? 1 - : UTF8SKIP(uc); - } - else { - if (foldlen) { - foldlen -= UTF8SKIP(uc); - } - else { - foldlen = is_MULTI_CHAR_FOLD_utf8(uc); - minbytes++; - } - } + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; } else { - maxbytes += (UNI_IS_INVARIANT(*uc)) - ? 1 - : 2; - if (! folder) { - minbytes++; - } - else { - if (foldlen) { - foldlen--; - } - else { - foldlen = is_MULTI_CHAR_FOLD_latin1(uc); - minbytes++; + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); } } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ if ( uvc < 256 ) { if ( folder ) { U8 folded= folder[ (U8) uvc ]; @@ -2171,6 +2180,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, set_bit = 0; /* We've done our bit :-) */ } } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + SV** svpp; if ( !widecharmap ) widecharmap = newHV(); @@ -2185,14 +2202,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, TRIE_STORE_REVCHAR(uvc); } } - } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ if( cur == first ) { - trie->minlen = minbytes; - trie->maxlen = maxbytes; - } else if (minbytes < trie->minlen) { - trie->minlen = minbytes; - } else if (maxbytes > trie->maxlen) { - trie->maxlen = maxbytes; + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; } } /* end first pass */ DEBUG_TRIE_COMPILE_r( @@ -3072,7 +3092,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode PerlIO_printf(Perl_debug_log, "\n"); }); Safefree(q); - /*RExC_seen |= REG_SEEN_TRIEDFA;*/ + /*RExC_seen |= REG_TRIEDFA_SEEN;*/ } @@ -3080,7 +3100,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode DEBUG_OPTIMISE_r({if (scan){ \ SV * const mysv=sv_newmortal(); \ regnode *Next = regnext(scan); \ - regprop(RExC_rx, mysv, scan); \ + regprop(RExC_rx, mysv, scan, NULL); \ PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \ (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ Next ? (REG_NODE_NUM(Next)) : 0 ); \ @@ -3375,7 +3395,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, length sequence we are looking for is 2 */ { int count = 0; /* How many characters in a multi-char fold */ - int len = is_MULTI_CHAR_FOLD_utf8(s); + int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); if (! len) { /* Not a multi-char fold: get next char */ s += UTF8SKIP(s); continue; @@ -3468,7 +3488,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, : s_end -1; while (s < upper) { - int len = is_MULTI_CHAR_FOLD_latin1(s); + int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. */ if (*s == LATIN_SMALL_LETTER_SHARP_S && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) @@ -3540,8 +3560,6 @@ typedef struct scan_frame { } scan_frame; -#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) - STATIC SSize_t S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SSize_t *minlenp, SSize_t *deltap, @@ -3620,9 +3638,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, DEBUG_PEEP("Peep", scan, depth); - /* Its not clear to khw or hv why this is done here, and not in the - * clauses that deal with EXACT nodes. khw's guess is that it's - * because of a previous design */ + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); /* Follow the next-chain of the current node and optimize @@ -3667,11 +3686,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, regnode_ssc accum; regnode * const startbranch=scan; - if (flags & SCF_DO_SUBSTR) - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge - strings after - this. */ - if (flags & SCF_DO_STCLASS) + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + + if (flags & SCF_DO_STCLASS) ssc_init_zero(pRExC_state, &accum); while (OP(scan) == code) { @@ -3728,7 +3748,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - ssc_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); } if (code == IFTHEN && num < 2) /* Empty ELSE branch */ min1 = 0; @@ -3748,15 +3768,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); if (min1) { - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - ssc_and(pRExC_state, data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { @@ -3770,8 +3790,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } - if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) - == BRANCH ) + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) { /* demq. @@ -3848,7 +3868,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, tail ); + regprop(RExC_rx, mysv, tail, NULL); PerlIO_printf( Perl_debug_log, "%*s%s%s\n", (int)depth * 2 + 2, "", "Looking for TRIE'able sequences. Tail node is: ", @@ -3929,16 +3949,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); - regprop(RExC_rx, mysv, noper); + regprop(RExC_rx, mysv, noper, NULL); PerlIO_printf( Perl_debug_log, " -> %s", SvPV_nolen_const(mysv)); if ( noper_next ) { - regprop(RExC_rx, mysv, noper_next ); + regprop(RExC_rx, mysv, noper_next, NULL); PerlIO_printf( Perl_debug_log,"\t=> %s\t", SvPV_nolen_const(mysv)); } @@ -4037,7 +4057,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle unmergable node */ } /* loop over branches */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, @@ -4061,7 +4081,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( startbranch == first && scan == tail ) { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; } } #endif @@ -4077,7 +4097,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * something like this: (?:|) So we can * turn it into a plain NOTHING op. */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); @@ -4140,7 +4160,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* some form of infinite recursion, assume infinite length * */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; @@ -4221,7 +4241,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else if (flags & SCF_DO_STCLASS_OR) { ssc_add_cp(data->start_class, uc); - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; @@ -4237,7 +4257,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { assert(data); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } if (UTF) { const U8 * const s = (U8 *)STRING(scan); @@ -4245,7 +4265,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l = utf8_length(s, s + l); } if (unfolded_multi_char) { - RExC_seen |= REG_SEEN_UNFOLDED_MULTI; + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; } min += l - min_subtract; assert (min >= 0); @@ -4261,12 +4281,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } if (OP(scan) == EXACTFL) { - if (flags & SCF_DO_STCLASS_AND) { - ssc_flags_and(data->start_class, ANYOF_LOCALE); - } - else if (flags & SCF_DO_STCLASS_OR) { - ANYOF_FLAGS(data->start_class) |= ANYOF_LOCALE; - } /* We don't know what the folds are; it could be anything. XXX * Actually, we only support UTF-8 encoding for code points @@ -4341,7 +4355,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else if (flags & SCF_DO_STCLASS_OR) { ssc_union(data->start_class, EXACTF_invlist, FALSE); - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; @@ -4384,13 +4398,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan = NEXTOPER(scan); goto do_curly; } - is_inf = is_inf_internal = 1; - scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); /* Cannot extend fixed substrings */ data->longest = &(data->longest_float); } + is_inf = is_inf_internal = 1; + scan = regnext(scan); goto optimize_curly_tail; case CURLY: if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) @@ -4411,7 +4425,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, next_is_eval = (OP(scan) == EVAL); do_curly: if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); /* Cannot extend fixed substrings */ pos_before = data->pos_min; } @@ -4452,7 +4467,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->start_class = oclass; if (mincount == 0 || minnext == 0) { if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, &this_class); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); } else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of @@ -4466,11 +4481,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, &this_class); - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); } else if (flags & SCF_DO_STCLASS_AND) - ssc_and(pRExC_state, data->start_class, &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); flags &= ~SCF_DO_STCLASS; } if (!scan) /* It was not CURLYX, but CURLY. */ @@ -4494,12 +4509,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, is_inf_internal |= deltanext == SSize_t_MAX || (maxcount == REG_INFTY && minnext + deltanext > 0); is_inf |= is_inf_internal; - if (is_inf) + if (is_inf) { delta = SSize_t_MAX; - else + } else { delta += (minnext + deltanext) * maxcount - minnext * mincount; - + } /* Try powerful optimization CURLYX => CURLYN. */ if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR @@ -4553,7 +4568,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* Nor characters whose fold at run-time may be * multi-character */ - && ! (RExC_seen & REG_SEEN_UNFOLDED_MULTI) + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -4625,6 +4640,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, pars++; if (flags & SCF_DO_SUBSTR) { SV *last_str = NULL; + STRLEN last_chrs = 0; int counted = mincount != 0; if (data->last_end > 0 && mincount != 0) { /* Ends with a @@ -4640,9 +4656,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { + SvGROW(last_str, (mincount * l) + 1); repeatcpy(SvPVX(last_str) + l, SvPVX_const(last_str), l, @@ -4658,8 +4677,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str) - l; + mg->mg_len += last_chrs * (mincount-1); } + last_chrs *= mincount; data->last_end += l * (mincount - 1); } } else { @@ -4691,7 +4711,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", if (mincount != maxcount) { /* Cannot extend fixed substrings found inside the group. */ - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); if (mincount && last_str) { SV * const sv = data->last_found; MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? @@ -4701,12 +4721,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", mg->mg_len = -1; sv_setsv(sv, last_str); data->last_end = data->pos_min; - data->last_start_min = - data->pos_min - CHR_SVLEN(last_str); + data->last_start_min = data->pos_min - last_chrs; data->last_start_max = is_inf ? SSize_t_MAX - : data->pos_min + data->pos_delta - - CHR_SVLEN(last_str); + : data->pos_min + data->pos_delta - last_chrs; } data->longest = &(data->longest_float); } @@ -4730,8 +4748,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", case REF: case CLUMP: if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect - anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; @@ -4761,7 +4779,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", ssc_union(data->start_class, PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); /* See commit msg for * 749e076fceedeb708a624933726e7989f2302f6a */ @@ -4772,8 +4790,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", min++; delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect - anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += 1; data->pos_delta += 1; data->longest = &(data->longest_float); @@ -4782,7 +4800,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", else if (REGNODE_SIMPLE(OP(scan))) { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min++; } min++; @@ -4835,10 +4853,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", case ANYOF: if (flags & SCF_DO_STCLASS_AND) ssc_and(pRExC_state, data->start_class, - (regnode_ssc*) scan); + (regnode_charclass *) scan); else ssc_or(pRExC_state, data->start_class, - (regnode_ssc*)scan); + (regnode_charclass *) scan); break; case NPOSIXL: @@ -4873,16 +4891,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", ssc_match_all_cp(data->start_class); ANYOF_POSIXL_CLEAR(data->start_class, namedclass); ANYOF_POSIXL_CLEAR(data->start_class, complement); - if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class)) - { - ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL; - } } else { /* The usual case; just add this class to the existing set */ ANYOF_POSIXL_SET(data->start_class, namedclass); - ANYOF_FLAGS(data->start_class) - |= ANYOF_LOCALE|ANYOF_POSIXL; } } break; @@ -4912,10 +4924,12 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", /* NPOSIXD matches all upper Latin1 code points unless the * target string being matched is UTF-8, which is - * unknowable until match time */ - if (PL_regkind[OP(scan)] == NPOSIXD) { - _invlist_union_complement_2nd(my_invlist, - PL_XPosix_ptrs[_CC_ASCII], &my_invlist); + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); } join_posix: @@ -4930,7 +4944,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", } } if (flags & SCF_DO_STCLASS_OR) - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } @@ -4938,7 +4952,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } else if ( PL_regkind[OP(scan)] == BRANCHJ @@ -4957,7 +4971,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", DEBUG_STUDYDATA("OPFAIL",data,depth); /*DEBUG_PARSE_MSG("opfail");*/ - regprop(RExC_rx, mysv_val, upto); + regprop(RExC_rx, mysv_val, upto, NULL); PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val), @@ -5033,7 +5047,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", ssc_init(pRExC_state, data->start_class); } else { /* AND before and after: combine and continue */ - ssc_and(pRExC_state, data->start_class, &intrnl); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } } } @@ -5065,7 +5079,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", if ((flags & SCF_DO_SUBSTR) && data->last_found) { f |= SCF_DO_SUBSTR; if (scan->flags) - SCAN_COMMIT(pRExC_state, &data_fake,minlenp); + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); data_fake.last_found=newSVsv(data->last_found); } } @@ -5104,7 +5118,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", *minnextp += min; if (f & SCF_DO_STCLASS_AND) { - ssc_and(pRExC_state, data->start_class, &intrnl); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -5115,7 +5129,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { if (RExC_rx->minlen<*minnextp) RExC_rx->minlen=*minnextp; - SCAN_COMMIT(pRExC_state, &data_fake, minnextp); + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); SvREFCNT_dec_NN(data_fake.last_found); if ( data_fake.minlen_fixed != minlenp ) @@ -5159,7 +5173,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", } else if ( PL_regkind[OP(scan)] == ENDLIKE ) { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); flags &= ~SCF_DO_SUBSTR; } if (data && OP(scan)==ACCEPT) { @@ -5171,7 +5185,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; @@ -5204,9 +5218,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", SSize_t max1 = 0, min1 = SSize_t_MAX; regnode_ssc accum; - if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ - SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings - after this. */ + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } if (flags & SCF_DO_STCLASS) ssc_init_zero(pRExC_state, &accum); @@ -5275,7 +5290,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - ssc_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); } } if (flags & SCF_DO_SUBSTR) { @@ -5287,15 +5302,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); if (min1) { - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - ssc_and(pRExC_state, data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { @@ -5320,8 +5335,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", delta += (trie->maxlen - trie->minlen); flags &= ~SCF_DO_STCLASS; /* xxx */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect - anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += trie->minlen; data->pos_delta += (trie->maxlen - trie->minlen); if (trie->maxlen != trie->minlen) @@ -5364,6 +5379,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", *scanp = scan; *deltap = is_inf_internal ? SSize_t_MAX : delta; + if (flags & SCF_DO_SUBSTR && is_inf) data->pos_delta = SSize_t_MAX - data->pos_min; if (is_par > (I32)U8_MAX) @@ -5377,13 +5393,21 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", data->flags &= ~SF_IN_PAR; } if (flags & SCF_DO_STCLASS_OR) - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); if (flags & SCF_TRIE_RESTUDY) data->flags |= SCF_TRIE_RESTUDY; DEBUG_STUDYDATA("post-fin:",data,depth); - return min < stopmin ? min : stopmin; + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ } STATIC U32 @@ -6049,8 +6073,8 @@ S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, || (eol /* Can't have SEOL and MULTI */ && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) ) - /* See comments for join_exact for why REG_SEEN_UNFOLDED_MULTI */ - || (RExC_seen & REG_SEEN_UNFOLDED_MULTI)) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) { return FALSE; } @@ -6337,10 +6361,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (rx_flags & PMf_FOLD) { RExC_contains_i = 1; } - if (initial_charset == REGEX_LOCALE_CHARSET) { - RExC_contains_locale = 1; - } - else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ @@ -6368,6 +6389,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_sawback = 0; RExC_seen = 0; + RExC_maxlen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; @@ -6508,8 +6530,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, bool has_default = (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) || ! has_charset); - bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT) - == REG_SEEN_RUN_ON_COMMENT); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ @@ -6573,13 +6595,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* setup various meta data about recursion, this all requires * RExC_npar to be correctly set, and a bit later on we clear it */ - if (RExC_seen & REG_SEEN_RECURSE) { + if (RExC_seen & REG_RECURSE_SEEN) { Newxz(RExC_open_parens, RExC_npar,regnode *); SAVEFREEPV(RExC_open_parens); Newxz(RExC_close_parens,RExC_npar,regnode *); SAVEFREEPV(RExC_close_parens); } - if (RExC_seen & (REG_SEEN_RECURSE | REG_SEEN_GOSTART)) { + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { /* Note, RExC_npar is 1 + the number of parens in a pattern. * So its 1 if there are no parens. */ RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + @@ -6643,10 +6665,10 @@ reStudy: DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); RExC_state = copyRExC_state; - if (seen & REG_TOP_LEVEL_BRANCHES) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; else - RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; StructCopy(&zero_scan_data, &data, scan_data_t); } #else @@ -6666,7 +6688,7 @@ reStudy: /* testing for BRANCH here tells us whether there is "must appear" data in the pattern. If there is then we can use it for optimisations */ - if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. */ SSize_t fake; STRLEN longest_float_length, longest_fixed_length; @@ -6844,9 +6866,11 @@ reStudy: if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen - && !(RExC_seen & REG_SEEN_VERBARG) - && !((RExC_seen & REG_SEEN_GPOS) || (r->intflags & PREGf_ANCH_GPOS))) + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ r->extflags |= RXf_CHECK_ALL; + } scan_commit(pRExC_state, &data,&minlen,0); longest_float_length = CHR_SVLEN(data.longest_float); @@ -6920,7 +6944,7 @@ reStudy: ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); @@ -6930,14 +6954,16 @@ reStudy: /* A temporary algorithm prefers floated substr to fixed one to dig * more info. */ if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; r->check_end_shift = r->anchored_end_shift; r->check_substr = r->anchored_substr; r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; - if (r->intflags & PREGf_ANCH_SINGLE) + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) r->intflags |= PREGf_NOSCAN; } else { + r->substrs->check_ix = 1; r->check_end_shift = r->float_end_shift; r->check_substr = r->float_substr; r->check_utf8 = r->float_utf8; @@ -6949,6 +6975,8 @@ reStudy: if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->extflags |= RXf_INTUIT_TAIL; } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) if ( (STRLEN)minlen < longest_float_length ) minlen= longest_float_length; @@ -6996,7 +7024,7 @@ reStudy: ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); @@ -7004,31 +7032,39 @@ reStudy: } } + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", - (IV)minlen, (IV)r->minlen); + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); }); r->minlenret = minlen; if (r->minlen < minlen) r->minlen = minlen; - if (RExC_seen & REG_SEEN_GPOS) + if (RExC_seen & REG_GPOS_SEEN) r->intflags |= PREGf_GPOS_SEEN; - if (RExC_seen & REG_SEEN_LOOKBEHIND) + if (RExC_seen & REG_LOOKBEHIND_SEEN) r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */ if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; - if (RExC_seen & REG_SEEN_CANY) + if (RExC_seen & REG_CANY_SEEN) r->intflags |= PREGf_CANY_SEEN; - if (RExC_seen & REG_SEEN_VERBARG) + if (RExC_seen & REG_VERBARG_SEEN) { r->intflags |= PREGf_VERBARG_SEEN; r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ } - if (RExC_seen & REG_SEEN_CUTGROUP) + if (RExC_seen & REG_CUTGROUP_SEEN) r->intflags |= PREGf_CUTGROUP_SEEN; if (pm_flags & PMf_USE_RE_EVAL) r->intflags |= PREGf_USE_RE_EVAL; @@ -7042,7 +7078,15 @@ reStudy: if (r->intflags & PREGf_ANCH) r->extflags |= RXf_IS_ANCHORED; + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ regnode *first = ri->program + 1; U8 fop = OP(first); regnode *next = NEXTOPER(first); @@ -7064,6 +7108,11 @@ reStudy: r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + #ifdef DEBUGGING if (RExC_paren_names) { ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); @@ -7291,7 +7340,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) } else if (flags & RXapif_ONE) { ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); av = MUTABLE_AV(SvRV(ret)); - length = av_len(av); + length = av_tindex(av); SvREFCNT_dec_NN(ret); return newSViv(length + 1); } else { @@ -7405,7 +7454,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, assert(s >= rx->subbeg); assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); if (i >= 0) { -#if NO_TAINT_SUPPORT +#ifdef NO_TAINT_SUPPORT sv_setpvn(sv, s, i); #else const int oldtainted = TAINT_get; @@ -9151,7 +9200,6 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) } cs = REGEX_LOCALE_CHARSET; has_charset_modifier = LOCALE_PAT_MOD; - RExC_contains_locale = 1; break; case UNICODE_PAT_MOD: if (has_charset_modifier) { @@ -9424,7 +9472,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* [19:06] :: is then */ if ( memEQs(start_verb,verb_len,"THEN") ) { op = CUTGROUP; - RExC_seen |= REG_SEEN_CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; } break; } @@ -9457,7 +9505,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } if (!internal_argval) - RExC_seen |= REG_SEEN_VERBARG; + RExC_seen |= REG_VERBARG_SEEN; } else if ( start_arg ) { vFAIL3("Verb pattern '%.*s' may not have an argument", verb_len, start_verb); @@ -9610,7 +9658,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) paren = 1; goto capturing_parens; } - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; case '=': /* (?=...) */ @@ -9654,7 +9702,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); - RExC_seen |= REG_SEEN_GOSTART; + RExC_seen |= REG_GOSTART_SEEN; *flagp |= POSTPONED; nextchar(pRExC_state); return ret; @@ -9735,7 +9783,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else { RExC_size++; } - RExC_seen |= REG_SEEN_RECURSE; + RExC_seen |= REG_RECURSE_SEEN; Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ @@ -9995,7 +10043,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE + if (RExC_seen & REG_RECURSE_SEEN && !RExC_open_parens[parno-1]) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, @@ -10086,7 +10134,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 1: case 2: ender = reganode(pRExC_state, CLOSE, parno); - if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting close paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ender))); @@ -10118,8 +10166,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("lsbr"); - regprop(RExC_rx, mysv_val1, lastbr); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(lastbr), @@ -10133,7 +10181,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (have_branch && !SIZE_ONLY) { char is_nothing= 1; if (depth==1) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; /* Hook the tails of the branches to the closing node. */ for (br = ret; br; br = regnext(br)) { @@ -10159,8 +10207,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("NADA"); - regprop(RExC_rx, mysv_val1, ret); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(ret), @@ -10456,6 +10504,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; goto nest_check; } @@ -10493,6 +10543,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, STAR, ret, depth+1); ret->flags = 0; RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '*') { min = 0; @@ -10502,6 +10553,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, PLUS, ret, depth+1); ret->flags = 0; RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '+') { min = 1; @@ -10868,7 +10920,8 @@ S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) PERL_STATIC_INLINE void S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, - regnode *node, I32* flagp, STRLEN len, UV code_point) + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) { /* This knows the details about sizing an EXACTish node, setting flags for * it (by setting <*flagp>, and potentially populating it with a single @@ -10890,13 +10943,24 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, * * It knows that under FOLD, the Latin Sharp S and UTF characters above * 255, must be folded (the former only when the rules indicate it can - * match 'ss') */ + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ bool len_passed_in = cBOOL(len != 0); U8 character[UTF8_MAXBYTES_CASE+1]; PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + if (! len_passed_in) { if (UTF) { if (UNI_IS_INVARIANT(code_point)) { @@ -10908,18 +10972,31 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, EBCDIC, but it works there, as the extra invariants fold to themselves) */ *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } } len = 1; } else if (FOLD && (! LOC || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) { /* Folding, and ok to do so now */ - _to_uni_fold_flags(code_point, + UV folded = _to_uni_fold_flags( + code_point, character, &len, FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) ? FOLD_FLAGS_NOMIX_ASCII : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } } else if (code_point <= MAX_UTF8_TWO_BYTE) { @@ -10932,19 +11009,36 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, uvchr_to_utf8( character, code_point); len = UTF8SKIP(character); } - } /* Else pattern isn't UTF8. We only fold the sharp s, when - appropriate */ - else if (UNLIKELY(code_point == LATIN_SMALL_LETTER_SHARP_S) - && FOLD - && AT_LEAST_UNI_SEMANTICS - && ! ASCII_FOLD_RESTRICTED) - { + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { + *character = (U8) code_point; + len = 1; + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { *character = 's'; *(character + 1) = 's'; len = 2; } else { - *character = (U8) code_point; + *character = LATIN_SMALL_LETTER_SHARP_S; len = 1; } } @@ -10970,6 +11064,11 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, { *flagp |= SIMPLE; } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } } @@ -10981,7 +11080,7 @@ S_backref_value(char *p) { char *q = p; - for (;isDIGIT(*q); q++); /* calculate length of num */ + for (;isDIGIT(*q); q++) {} /* calculate length of num */ if (q - p == 0 || q - p > 9) return I32_MAX; return atoi(p); @@ -11194,7 +11293,7 @@ tryagain: goto finish_meta_pat; case 'G': ret = reg_node(pRExC_state, GPOS); - RExC_seen |= REG_SEEN_GPOS; + RExC_seen |= REG_GPOS_SEEN; *flagp |= SIMPLE; goto finish_meta_pat; case 'K': @@ -11205,7 +11304,7 @@ tryagain: * be necessary here to avoid cases of memory corruption, as * with: C<$_="x" x 80; s/x\K/y/> -- rgs */ - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -11219,7 +11318,7 @@ tryagain: goto finish_meta_pat; case 'C': ret = reg_node(pRExC_state, CANY); - RExC_seen |= REG_SEEN_CANY; + RExC_seen |= REG_CANY_SEEN; *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'X': @@ -11236,30 +11335,38 @@ tryagain: case 'b': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = BOUND + get_regex_charset(RExC_flags); if (op > BOUNDA) { /* /aa is same as /a */ op = BOUNDA; } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); } goto finish_meta_pat; case 'B': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = NBOUND + get_regex_charset(RExC_flags); if (op > NBOUNDA) { /* /aa is same as /a */ op = NBOUNDA; } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); } goto finish_meta_pat; @@ -11303,6 +11410,9 @@ tryagain: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } join_posix_op_known: @@ -11462,11 +11572,15 @@ tryagain: } else { num = S_backref_value(RExC_parse); - /* bare \NNN might be backref or octal */ + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ if (num == I32_MAX || (num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')) + { /* Probably a character specified in octal, e.g. \35 */ goto defchar; + } } /* at this point RExC_parse definitely points to a backref @@ -11753,7 +11867,7 @@ tryagain: } case 'c': p++; - ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); + ender = grok_bslash_c(*p++, SIZE_ONLY); break; case '8': case '9': /* must be a backreference */ --p; @@ -11772,7 +11886,11 @@ tryagain: * 118 OR as "\11" . "8" depending on whether there * were 118 capture buffers defined already in the * pattern. */ - if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar) + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) { /* Not to be treated as an octal constant, go find backref */ --p; @@ -11829,7 +11947,7 @@ tryagain: if (! SIZE_ONLY && RExC_flags & RXf_PMf_EXTENDED && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low(p, UTF)) + && is_PATWS_non_low_safe(p, RExC_end, UTF)) { vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), "Escape literal pattern white space under /x"); @@ -12181,7 +12299,12 @@ tryagain: OP(ret) = EXACTFU; } } - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); } RExC_parse = p - 1; @@ -12220,7 +12343,7 @@ S_regwhite( RExC_state_t *pRExC_state, char *p ) } } while (p < e); if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; } else break; @@ -12234,7 +12357,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) /* Returns the next non-pattern-white space, non-comment character (the * latter only if 'recognize_comment is true) in the string p, which is * ended by RExC_end. If there is no line break ending a comment, - * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */ + * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */ const char *e = RExC_end; PERL_ARGS_ASSERT_REGPATWS; @@ -12254,7 +12377,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) } } while (p < e); if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; } else break; @@ -12289,6 +12412,9 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) if (end == UV_MAX && start <= 256) { ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } /* Quit if are above what we should change */ if (start > 255) { @@ -13123,6 +13249,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * that fold to/from them under /i */ SV* cp_foldable_list = NULL; + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + #ifdef EBCDIC /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ @@ -13157,9 +13287,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ANYOF_FLAGS(ret) = 0; RExC_emit += ANYOF_SKIP; - if (LOC) { - ANYOF_FLAGS(ret) |= ANYOF_LOCALE; - } listsv = newSVpvs_flags("# comment\n", SVs_TEMP); initial_listsv_len = SvCUR(listsv); SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ @@ -13316,12 +13443,12 @@ parseit: e = strchr(RExC_parse++, '}'); if (!e) vFAIL2("Missing right brace on \\%c{}", c); - while (isSPACE(UCHARAT(RExC_parse))) + while (isSPACE(*RExC_parse)) RExC_parse++; if (e == RExC_parse) vFAIL2("Empty \\%c{}", c); n = e - RExC_parse; - while (isSPACE(UCHARAT(RExC_parse + n - 1))) + while (isSPACE(*(RExC_parse + n - 1))) n--; } else { @@ -13341,7 +13468,7 @@ parseit: * that bit) */ value ^= 'P' ^ 'p'; - while (isSPACE(UCHARAT(RExC_parse))) { + while (isSPACE(*RExC_parse)) { RExC_parse++; n--; } @@ -13496,7 +13623,7 @@ parseit: goto recode_encoding; break; case 'c': - value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY); + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': @@ -13568,47 +13695,6 @@ parseit: /* Here, we have the current token in 'value' */ - /* What matches in a locale is not known until runtime. This includes - * what the Posix classes (like \w, [:space:]) match. Room must be - * reserved (one time per outer bracketed class) to store such classes, - * either if Perl is compiled so that locale nodes always should have - * this space, or if there is such posix class info to be stored. The - * space will contain a bit for each named class that is to be matched - * against. This isn't needed for \p{} and pseudo-classes, as they are - * not affected by locale, and hence are dealt with separately */ - if (LOC) { - if (FOLD && ! need_class) { - need_class = 1; - if (SIZE_ONLY) { - RExC_size += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP; - } - else { - RExC_emit += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP; - } - - /* We need to initialize this here because this node type has - * this field, and will skip getting initialized when we get to - * a posix class since are doing it here */ - ANYOF_POSIXL_ZERO(ret); - } - if (ANYOF_LOCALE == ANYOF_POSIXL - || (namedclass > OOB_NAMEDCLASS - && namedclass < ANYOF_POSIXL_MAX)) - { - if (! need_class) { - need_class = 1; - if (SIZE_ONLY) { - RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; - } - else { - RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; - } - ANYOF_POSIXL_ZERO(ret); - } - ANYOF_FLAGS(ret) |= ANYOF_POSIXL; - } - } - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ U8 classnum; @@ -13647,10 +13733,25 @@ parseit: #ifndef HAS_ISASCII && classnum != _CC_ASCII #endif -#ifndef HAS_ISBLANK - && classnum != _CC_BLANK -#endif ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } /* See if it already matches the complement of this POSIX * class */ @@ -13733,16 +13834,6 @@ parseit: ? &posixes : &nposixes; SV** source_ptr = &PL_XPosix_ptrs[classnum]; -#ifndef HAS_ISBLANK - /* If the platform doesn't have isblank(), we handle locale - * with the hardcoded ASII values. */ - if (LOC && classnum == _CC_BLANK) { - _invlist_subtract(*source_ptr, - PL_UpperLatin1, - source_ptr); - } -#endif - _invlist_union_maybe_complement_2nd( *posixes_ptr, *source_ptr, @@ -13977,7 +14068,7 @@ parseit: #endif /* Look at the longest folds first */ - for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) { + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { if (av_exists(multi_char_matches, cp_count)) { AV** this_array_ptr; @@ -14110,14 +14201,6 @@ parseit: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } -#ifndef HAS_ISBLANK - if (op == POSIXL - && (namedclass == ANYOF_BLANK - || namedclass == ANYOF_NBLANK)) - { - op = POSIXA; - } -#endif join_posix: /* The odd numbered ones are the complements of the @@ -14179,6 +14262,9 @@ parseit: else { RExC_emit = (regnode *)orig_emit; if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } if (invert) { op += NPOSIXD - POSIXD; } @@ -14194,7 +14280,9 @@ parseit: *flagp |= HASWIDTH|SIMPLE; } else if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } RExC_parse = (char *) cur_parse; @@ -14225,8 +14313,7 @@ parseit: * runtime only when the locale indicates Unicode rules. For * non-locale, we just use to the general list */ if (LOC) { - use_list = &ANYOF_UTF8_LOCALE_INVLIST(ret); - *use_list = NULL; + use_list = &only_utf8_locale_list; } else { use_list = &cp_list; @@ -14390,7 +14477,7 @@ parseit: { AV* list = (AV*) *listp; IV k; - for (k = 0; k <= av_len(list); k++) { + for (k = 0; k <= av_tindex(list); k++) { SV** c_p = av_fetch(list, k, FALSE); UV c; if (c_p == NULL) { @@ -14557,7 +14644,7 @@ parseit: * fetching). We know to set the flag if we have a non-NULL list for UTF-8 * locales, or the class matches at least one 0-255 range code point */ if (LOC && FOLD) { - if (ANYOF_UTF8_LOCALE_INVLIST(ret)) { + if (only_utf8_locale_list) { ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; } else if (cp_list) { /* Look to see if there a 0-255 code point is in @@ -14575,8 +14662,9 @@ parseit: * at compile time. Besides not inverting folded locale now, we can't * invert if there are things such as \w, which aren't known until runtime * */ - if (invert - && ! (ANYOF_FLAGS(ret) & (ANYOF_LOC_FOLD|ANYOF_POSIXL)) + if (cp_list + && invert + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) && ! depends_list && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { @@ -14624,13 +14712,13 @@ parseit: if (cp_list && ! invert && ! depends_list - && ! (ANYOF_FLAGS(ret) & (ANYOF_LOC_FOLD|ANYOF_POSIXL)) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* We don't optimize if we are supposed to make sure all non-Unicode * code points raise a warning, as only ANYOF nodes have this check. * */ - && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) + && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) { UV start, end; U8 op = END; /* The optimzation node-type */ @@ -14723,7 +14811,9 @@ parseit: RExC_parse = (char *)cur_parse; if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } SvREFCNT_dec_NN(cp_list); @@ -14754,6 +14844,7 @@ parseit: else { cp_list = depends_list; } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; } /* If there is a swash and more than one element, we can't use the swash in @@ -14766,9 +14857,15 @@ parseit: set_ANYOF_arg(pRExC_state, ret, cp_list, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) ? listsv : NULL, + only_utf8_locale_list, swash, has_user_defined_property); *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + return ret; } @@ -14779,6 +14876,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, regnode* const node, SV* const cp_list, SV* const runtime_defns, + SV* const only_utf8_locale_list, SV* const swash, const bool has_user_defined_property) { @@ -14796,23 +14894,30 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, * av[1] if &PL_sv_undef, is a placeholder to later contain the swash * computed from av[0]. But if no further computation need be done, * the swash is stored here now (and av[0] is &PL_sv_undef). - * av[2] stores the cp_list inversion list for use in addition or instead + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. * (Otherwise everything needed is already in av[0] and av[1]) - * av[3] is set if any component of the class is from a user-defined - * property; used only if av[2] exists */ + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ UV n; PERL_ARGS_ASSERT_SET_ANYOF_ARG; - if (! cp_list && ! runtime_defns) { + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); ARG_SET(node, ANYOF_NONBITMAP_EMPTY); } else { AV * const av = newAV(); SV *rv; + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + av_store(av, 0, (runtime_defns) ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); if (swash) { @@ -14822,11 +14927,18 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, else { av_store(av, 1, &PL_sv_undef); if (cp_list) { - av_store(av, 2, cp_list); - av_store(av, 3, newSVuv(has_user_defined_property)); + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); } } + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; @@ -14839,7 +14951,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, Absorbs an /x style # comments from the input stream. Returns true if there is more text remaining in the stream. - Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment terminates the pattern without including a newline. Note its the callers responsibility to ensure that we are @@ -14862,7 +14974,7 @@ S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) if (!ended) { /* we ran off the end of the pattern without ending the comment, so we have to add an \n when wrapping */ - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; return 0; } else return 1; @@ -15148,7 +15260,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tail" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), @@ -15238,7 +15350,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), @@ -15251,7 +15363,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, DEBUG_PARSE_r({ SV * const mysv_val=sv_newmortal(); DEBUG_PARSE_MSG(""); - regprop(RExC_rx, mysv_val, val); + regprop(RExC_rx, mysv_val, val, NULL); PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", SvPV_nolen_const(mysv_val), @@ -15281,7 +15393,9 @@ S_regdump_intflags(pTHX_ const char *lead, const U32 flags) int bit; int set=0; - for (bit=0; bit<32; bit++) { + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitregstclass) { - regprop(r, sv, ri->regstclass); + regprop(r, sv, ri->regstclass, NULL); PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); } if (r->intflags & PREGf_ANCH) { @@ -15440,11 +15556,11 @@ Perl_regdump(pTHX_ const regexp *r) } /* -- regprop - printable representation of opcode +- regprop - printable representation of opcode, with run time support */ void -Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) { #ifdef DEBUGGING dVAR; @@ -15546,7 +15662,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) (UV)trie->maxlen, (UV)TRIE_CHARCOUNT(trie), (UV)trie->uniquecharcount - ) + ); ); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { sv_catpvs(sv, "["); @@ -15589,6 +15705,20 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } } } + if ( k == REF && reginfo) { + U32 n = ARG(o); /* which paren pair */ + I32 ln = prog->offs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } } else if (k == GOSUB) /* Paren and offset */ Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); @@ -15604,7 +15734,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) int do_sep = 0; - if (flags & ANYOF_LOCALE) + if (flags & ANYOF_LOCALE_FLAGS) sv_catpvs(sv, "{loc}"); if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); @@ -15627,8 +15757,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } } - if ((flags & ANYOF_ABOVE_LATIN1_ALL) - || ANYOF_UTF8_LOCALE_INVLIST(o) || ANYOF_NONBITMAP(o)) + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) { if (do_sep) { Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); @@ -15644,13 +15776,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* output information about the unicode matching */ if (flags & ANYOF_ABOVE_LATIN1_ALL) sv_catpvs(sv, "{unicode_all}"); - else if (ANYOF_NONBITMAP(o)) { + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { SV *lv; /* Set if there is something outside the bit map. */ bool byte_output = FALSE; /* If something in the bitmap has been output */ + SV *only_utf8_locale; /* Get the stuff that wasn't in the bitmap */ - (void) regclass_swash(prog, o, FALSE, &lv, NULL); + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); if (lv && lv != &PL_sv_undef) { char *s = savesvpv(lv); char * const origs = s; @@ -15701,25 +15835,27 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Safefree(origs); SvREFCNT_dec_NN(lv); } - } - /* Output any UTF-8 locale code points */ - if (flags & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(o)) { - UV start, end; - int max_entries = 256; - - sv_catpvs(sv, "{utf8 locale}"); - invlist_iterinit(ANYOF_UTF8_LOCALE_INVLIST(o)); - while (invlist_iternext(ANYOF_UTF8_LOCALE_INVLIST(o), - &start, &end)) { - put_range(sv, start, end); - max_entries --; - if (max_entries < 0) { - sv_catpvs(sv, "..."); - break; + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } } + invlist_iterfinish(only_utf8_locale); } - invlist_iterfinish(ANYOF_UTF8_LOCALE_INVLIST(o)); } } @@ -15727,10 +15863,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; - if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { - Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); - } - else { + if (index < C_ARRAY_LENGTH(anyofs)) { if (*anyofs[index] != '[') { sv_catpv(sv, "["); } @@ -15739,6 +15872,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpv(sv, "]"); } } + else { + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + } } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); @@ -15747,9 +15883,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) PERL_UNUSED_ARG(sv); PERL_UNUSED_ARG(o); PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); #endif /* DEBUGGING */ } + + SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ @@ -16487,7 +16626,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else CLEAR_OPTSTART; - regprop(r, sv, node); + regprop(r, sv, node, NULL); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); @@ -16584,11 +16723,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ - node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) - ? ANYOF_POSIXL_FOLD_SKIP - : (ANYOF_FLAGS(node) & ANYOF_POSIXL) - ? ANYOF_POSIXL_SKIP - : ANYOF_SKIP); + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); node = NEXTOPER(node); } else if (PL_regkind[(U8)op] == EXACT) {