X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/1505a47c967e97cd85fc9d764bd8736468ad8a95..f6555ff3f309865a88085cd93354cd93bfb96fd3:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 9bf05c0..e7c6662 100644 --- a/regcomp.c +++ b/regcomp.c @@ -93,6 +93,8 @@ EXTERN_C const struct regexp_engine my_reg_engine; #define HAS_NONLATIN1_FOLD_CLOSURE(i) \ _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) @@ -159,7 +161,8 @@ struct RExC_state_t { within pattern */ int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ -#if ADD_TO_REGEXEC + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif @@ -196,6 +199,7 @@ struct RExC_state_t { #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) #define RExC_npar (pRExC_state->npar) #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) @@ -741,38 +745,41 @@ static const scan_data_t zero_scan_data = DEBUG_OPTIMISE_MORE_r({ \ PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ \ - if (RExC_seen & REG_SEEN_ZERO_LEN) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN "); \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ \ - if (RExC_seen & REG_SEEN_LOOKBEHIND) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND "); \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ \ - if (RExC_seen & REG_SEEN_GPOS) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS "); \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ \ - if (RExC_seen & REG_SEEN_CANY) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY "); \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ \ - if (RExC_seen & REG_SEEN_RECURSE) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE "); \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ \ - if (RExC_seen & REG_TOP_LEVEL_BRANCHES) \ - PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES "); \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ \ - if (RExC_seen & REG_SEEN_VERBARG) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG "); \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ \ - if (RExC_seen & REG_SEEN_CUTGROUP) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP "); \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ \ - if (RExC_seen & REG_SEEN_RUN_ON_COMMENT) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT "); \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ \ - if (RExC_seen & REG_SEEN_EXACTF_SHARP_S) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_EXACTF_SHARP_S "); \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ \ - if (RExC_seen & REG_SEEN_GOSTART) \ - PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART "); \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ \ PerlIO_printf(Perl_debug_log,"\n"); \ }); @@ -919,7 +926,7 @@ S_ssc_is_anything(pTHX_ const regnode_ssc *ssc) } /* If e.g., both \w and \W are set, matches everything */ - if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) { + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { int i; for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { @@ -953,10 +960,6 @@ S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) * necessary. */ if (RExC_contains_locale) { ANYOF_POSIXL_SETALL(ssc); - ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL; - if (RExC_contains_i) { - ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD; - } } else { ANYOF_POSIXL_ZERO(ssc); @@ -989,16 +992,8 @@ S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, return FALSE; } - if (RExC_contains_locale) { - if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) - || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) - || ! ANYOF_POSIXL_TEST_ALL_SET(ssc)) - { - return FALSE; - } - if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) { - return FALSE; - } + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; } return TRUE; @@ -1006,16 +1001,19 @@ S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, - const regnode_charclass_posixl* const node) + const regnode_charclass* const node) { /* Returns a mortal inversion list defining which code points are matched * by 'node', which is of type ANYOF. Handles complementing the result if * appropriate. If some code points aren't knowable at this time, the - * returned list must, and will, contain every possible code point. */ + * returned list must, and will, contain every code point that is a + * possibility. */ SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; unsigned int i; const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; @@ -1035,11 +1033,18 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, * known until runtime -- we have to assume it could be anything */ return _add_range_to_invlist(invlist, 0, UV_MAX); } - else { + else if (ary[3] && ary[3] != &PL_sv_undef) { /* Here no compile-time swash, and no run-time only data. Use the * node's inversion list */ - invlist = sv_2mortal(invlist_clone(ary[2])); + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; } } @@ -1062,6 +1067,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, for (i = 0; i < 256; i++) { if (ANYOF_BITMAP_TEST(node, i)) { invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; } } @@ -1079,6 +1085,22 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, if (ANYOF_FLAGS(node) & ANYOF_INVERT) { _invlist_invert(invlist); } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } return invlist; } @@ -1089,30 +1111,13 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) -STATIC void -S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with) -{ - /* Take the flags 'and_with' and accumulate them anded into the flags for - * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored. - * The flags 'and_with' should not come from another SSC (otherwise the - * EMPTY_STRING flag won't work) */ - - const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS; - - PERL_ARGS_ASSERT_SSC_FLAGS_AND; - - /* Use just the SSC-related flags from 'and_with' */ - ANYOF_FLAGS(ssc) &= (and_with & ANYOF_COMMON_FLAGS); - ANYOF_FLAGS(ssc) |= ssc_only_flags; -} - /* 'AND' a given class with another one. Can create false positives. 'ssc' * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ STATIC void S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, - const regnode_ssc *and_with) + const regnode_charclass *and_with) { /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either * another SSC or a regular ANYOF class. Can create false positives. */ @@ -1127,7 +1132,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract * the code point inversion list and just the relevant flags */ if (is_ANYOF_SYNTHETIC(and_with)) { - anded_cp_list = and_with->invlist; + anded_cp_list = ((regnode_ssc *)and_with)->invlist; anded_flags = ANYOF_FLAGS(and_with); /* XXX This is a kludge around what appears to be deficiencies in the @@ -1145,13 +1150,12 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * creates bugs, the consequences are only that a warning isn't raised * that should be; while the consequences for having /l bugs is * incorrect matches */ - if (ssc_is_anything(and_with)) { + if (ssc_is_anything((regnode_ssc *)and_with)) { anded_flags |= ANYOF_WARN_SUPER; } } else { - anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, - (regnode_charclass_posixl*) and_with); + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; } @@ -1203,7 +1207,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { ANYOF_POSIXL_ZERO(ssc); } - else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) { + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { /* Note that the Posix class component P from 'and_with' actually * looks like: @@ -1234,10 +1238,10 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, ANYOF_POSIXL_ZERO(&temp); for (i = 0; i < ANYOF_MAX; i++) { assert(i % 2 != 0 - || ! ANYOF_POSIXL_TEST(and_with, i) - || ! ANYOF_POSIXL_TEST(and_with, i + 1)); + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); - if (ANYOF_POSIXL_TEST(and_with, i)) { + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { ANYOF_POSIXL_SET(&temp, i + add); } add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ @@ -1248,7 +1252,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC in its initial state */ else if (! is_ANYOF_SYNTHETIC(and_with) - || ! ssc_is_cp_posixl_init(pRExC_state, and_with)) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) { /* But if 'ssc' is in its initial state, the result is just 'and_with'; * copy it over 'ssc' */ @@ -1260,15 +1264,17 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, ssc->invlist = anded_cp_list; ANYOF_POSIXL_ZERO(ssc); if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { - ANYOF_POSIXL_OR(and_with, ssc); + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); } } } - else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL) - || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { /* One or the other of P1, P2 is non-empty. */ - ANYOF_POSIXL_AND(and_with, ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } ssc_union(ssc, anded_cp_list, FALSE); } else { /* P1 = P2 = empty */ @@ -1279,7 +1285,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, STATIC void S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, - const regnode_ssc *or_with) + const regnode_charclass *or_with) { /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either * another SSC or a regular ANYOF class. Can create false positives if @@ -1295,12 +1301,11 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract * the code point inversion list and just the relevant flags */ if (is_ANYOF_SYNTHETIC(or_with)) { - ored_cp_list = or_with->invlist; + ored_cp_list = ((regnode_ssc*) or_with)->invlist; ored_flags = ANYOF_FLAGS(or_with); } else { - ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, - (regnode_charclass_posixl*) or_with); + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; } @@ -1328,10 +1333,10 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, && ! is_ANYOF_SYNTHETIC(or_with)) { /* We ignore P2, leaving P1 going forward */ - } - else { /* Not inverted */ - ANYOF_POSIXL_OR(or_with, ssc); - if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) { + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { unsigned int i; for (i = 0; i < ANYOF_MAX; i += 2) { if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) @@ -1339,9 +1344,6 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, ssc_match_all_cp(ssc); ANYOF_POSIXL_CLEAR(ssc, i); ANYOF_POSIXL_CLEAR(ssc, i+1); - if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) { - ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL; - } } } } @@ -1442,9 +1444,17 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); - set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE); + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; + + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; + } - assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale); + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); } #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] @@ -1710,7 +1720,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, May be the same as tail. tail : item following the branch sequence count : words in the sequence - flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ + flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/ depth : indent depth Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. @@ -1999,6 +2009,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, }); re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + assert(re_trie_maxbuff); if (!SvIOK(re_trie_maxbuff)) { sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } @@ -2046,10 +2057,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); const U8 *e = uc + STR_LEN( noper ); - STRLEN foldlen = 0; + int foldlen = 0; U32 wordlen = 0; /* required init */ - STRLEN minbytes = 0; - STRLEN maxbytes = 0; + STRLEN minchars = 0; + STRLEN maxchars = 0; bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ @@ -2074,58 +2085,74 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); } } - for ( ; uc < e ; uc += len ) { + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; - /* Acummulate to the current values, the range in the number of - * bytes that this character could match. The max is presumed to - * be the same as the folded input (which TRIE_READ_CHAR returns), - * except that when this is not in UTF-8, it could be matched - * against a string which is UTF-8, and the variant characters - * could be 2 bytes instead of the 1 here. Likewise, for the - * minimum number of bytes when not folded. When folding, the min - * is assumed to be 1 byte could fold to match the single character - * here, or in the case of a multi-char fold, 1 byte can fold to - * the whole sequence. 'foldlen' is used to denote whether we are - * in such a sequence, skipping the min setting if so. XXX TODO - * Use the exact list of what folds to each character, from - * PL_utf8_foldclosures */ - if (UTF) { - maxbytes += UTF8SKIP(uc); - if (! folder) { - /* A non-UTF-8 string could be 1 byte to match our 2 */ - minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc)) - ? 1 - : UTF8SKIP(uc); - } - else { - if (foldlen) { - foldlen -= UTF8SKIP(uc); - } - else { - foldlen = is_MULTI_CHAR_FOLD_utf8(uc); - minbytes++; - } - } + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; } else { - maxbytes += (UNI_IS_INVARIANT(*uc)) - ? 1 - : 2; - if (! folder) { - minbytes++; - } - else { - if (foldlen) { - foldlen--; - } - else { - foldlen = is_MULTI_CHAR_FOLD_latin1(uc); - minbytes++; + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); } } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ if ( uvc < 256 ) { if ( folder ) { U8 folded= folder[ (U8) uvc ]; @@ -2156,6 +2183,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, set_bit = 0; /* We've done our bit :-) */ } } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + SV** svpp; if ( !widecharmap ) widecharmap = newHV(); @@ -2170,14 +2205,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, TRIE_STORE_REVCHAR(uvc); } } - } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ if( cur == first ) { - trie->minlen = minbytes; - trie->maxlen = maxbytes; - } else if (minbytes < trie->minlen) { - trie->minlen = minbytes; - } else if (maxbytes > trie->maxlen) { - trie->maxlen = maxbytes; + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; } } /* end first pass */ DEBUG_TRIE_COMPILE_r( @@ -2946,8 +2984,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, : MADE_TRIE; } -STATIC void -S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) +STATIC regnode * +S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) { /* The Trie is constructed and compressed now so we can build a fail array if * it's needed @@ -2985,13 +3023,26 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode U32 *fail; reg_ac_data *aho; const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); + regnode *stclass; GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; + PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif + if ( OP(source) == TRIE ) { + struct regnode_1 *op = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(source,op,struct regnode_1); + stclass = (regnode *)op; + } else { + struct regnode_charclass *op = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(source,op,struct regnode_charclass); + stclass = (regnode *)op; + } + OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */ ARG_SET( stclass, data_slot ); aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); @@ -3057,7 +3108,8 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode PerlIO_printf(Perl_debug_log, "\n"); }); Safefree(q); - /*RExC_seen |= REG_SEEN_TRIEDFA;*/ + /*RExC_seen |= REG_TRIEDFA_SEEN;*/ + return stclass; } @@ -3065,7 +3117,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode DEBUG_OPTIMISE_r({if (scan){ \ SV * const mysv=sv_newmortal(); \ regnode *Next = regnext(scan); \ - regprop(RExC_rx, mysv, scan); \ + regprop(RExC_rx, mysv, scan, NULL); \ PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \ (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ Next ? (REG_NODE_NUM(Next)) : 0 ); \ @@ -3084,10 +3136,18 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * * If a node is to match under /i (folded), the number of characters it matches * can be different than its character length if it contains a multi-character - * fold. *min_subtract is set to the total delta of the input nodes. + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. * - * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF - * and contains LATIN SMALL LETTER SHARP S + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) * * This is as good a place as any to discuss the design of handling these * multi-character fold sequences. It's been wrong in Perl for a very long @@ -3109,11 +3169,12 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * character folded sequences. Since a single character can fold into * such a sequence, the minimum match length for this node is less than * the number of characters in the node. This routine returns in - * *min_subtract how much to subtract from the the actual length of the - * string to get a real minimum match length; it is 0 if there are no - * multi-char foldeds. This delta is used by the caller to adjust the min - * length of the match, and the delta between min and max, so that the - * optimizer doesn't reject these possibilities based on size constraints. + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS * is used for an EXACTFU node that contains at least one "ss" sequence in * it. For non-UTF-8 patterns and strings, this is the only case where @@ -3121,11 +3182,12 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * EXACTFU node without UTF-8 involvement doesn't have to concern itself * with length changes, and so can be processed faster. regexec.c takes * advantage of this. Generally, an EXACTFish node that is in UTF-8 is - * pre-folded by regcomp.c. This saves effort in regex matching. - * However, the pre-folding isn't done for non-UTF8 patterns because the - * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things - * down by forcing the pattern into UTF8 unless necessary. Also what - * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold * possibilities for the non-UTF8 patterns are quite simple, except for * the sharp s. All the ones that don't involve a UTF-8 target string are * members of a fold-pair, and arrays are set up for all of them so that @@ -3133,37 +3195,45 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * this file makes sure that in EXACTFU nodes, the sharp s gets folded to * 'ss', even if the pattern isn't UTF-8. This avoids the issues * described in the next item. - * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the - * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a - * UTF-8 pattern.) An assumption that the optimizer part of regexec.c - * (probably unwittingly, in Perl_regexec_flags()) makes is that a - * character in the pattern corresponds to at most a single character in - * the target string. (And I do mean character, and not byte here, unlike - * other parts of the documentation that have never been updated to - * account for multibyte Unicode.) sharp s in EXACTF nodes can match the - * two character string 'ss'; in EXACTFA nodes it can match - * "\x{17F}\x{17F}". These violate the assumption, and they are the only - * instances where it is violated. I'm reluctant to try to change the - * assumption, as the code involved is impenetrable to me (khw), so - * instead the code here punts. This routine examines (when the pattern - * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a - * boolean indicating whether or not the node contains a sharp s. When it - * is true, the caller sets a flag that later causes the optimizer in this - * file to not set values for the floating and fixed string lengths, and - * thus avoids the optimizer code in regexec.c that makes the invalid + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid * assumption. Thus, there is no optimization based on string lengths for - * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s. - * (The reason the assumption is wrong only in these two cases is that all - * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all - * other folds to their expanded versions. We can't prefold sharp s to - * 'ss' in EXACTF nodes because we don't know at compile time if it - * actually matches 'ss' or not. It will match iff the target string is - * in UTF-8, unlike the EXACTFU nodes, where it always matches; and - * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8 - * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem; - * but in a non-UTF8 pattern, folding it to that above-Latin1 string would - * require the pattern to be forced into UTF-8, the overhead of which we - * want to avoid.) + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) * * Similarly, the code that generates tries doesn't currently handle * not-already-folded multi-char folds, and it looks like a pain to change @@ -3173,13 +3243,13 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * using /iaa matching will be doing so almost entirely with ASCII * strings, so this should rarely be encountered in practice */ -#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) STATIC U32 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, - UV *min_subtract, bool *has_exactf_sharp_s, + UV *min_subtract, bool *unfolded_multi_char, U32 flags,regnode *val, U32 depth) { /* Merge several consecutive EXACTish nodes into one. */ @@ -3227,8 +3297,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); - /* XXX I (khw) kind of doubt that this works on platforms where - * U8_MAX is above 255 because of lots of other assumptions */ + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ /* Don't join if the sum can't fit into a single node */ if (oldl + STR_LEN(n) > U8_MAX) break; @@ -3263,7 +3334,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, } *min_subtract = 0; - *has_exactf_sharp_s = FALSE; + *unfolded_multi_char = FALSE; /* Here, all the adjacent mergeable EXACTish nodes have been merged. We * can now analyze for sequences of problematic code points. (Prior to @@ -3271,15 +3342,68 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ if (OP(scan) != EXACT) { - const U8 * const s0 = (U8*) STRING(scan); - const U8 * s = s0; - const U8 * const s_end = s0 + STR_LEN(scan); + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ /* One pass is made over the node's string looking for all the - * possibilities. to avoid some tests in the loop, there are two main + * possibilities. To avoid some tests in the loop, there are two main * cases, for UTF-8 patterns (which can't have EXACTF nodes) and * non-UTF-8 */ if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ /* Examine the string for a multi-character fold sequence. UTF-8 * patterns have all characters pre-folded by the time this code is @@ -3288,40 +3412,31 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, length sequence we are looking for is 2 */ { int count = 0; /* How many characters in a multi-char fold */ - int len = is_MULTI_CHAR_FOLD_utf8(s); + int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); if (! len) { /* Not a multi-char fold: get next char */ s += UTF8SKIP(s); continue; } - /* Nodes with 'ss' require special handling, except for EXACTFL - * and EXACTFA-ish for which there is no multi-char fold to - * this */ + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ if (len == 2 && *s == 's' && *(s+1) == 's' - && OP(scan) != EXACTFL && OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = 2; - OP(scan) = EXACTFU_SS; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; + } s += 2; } else { /* Here is a generic multi-char fold. */ - const U8* multi_end = s + len; + U8* multi_end = s + len; - /* Count how many characters in it. In the case of /l and + /* Count how many characters are in it. In the case of * /aa, no folds which contain ASCII code points are - * allowed, so check for those, and skip if found. (In - * EXACTFL, no folds are allowed to any Latin1 code point, - * not just ASCII. But there aren't any of these - * currently, nor ever likely, so don't take the time to - * test for them. The code that generates the - * is_MULTI_foo() macros croaks should one actually get put - * into Unicode .) */ - if (OP(scan) != EXACTFL - && OP(scan) != EXACTFA - && OP(scan) != EXACTFA_NO_TRIE) - { + * allowed, so check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = utf8_length(s, multi_end); s = multi_end; } @@ -3341,9 +3456,23 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, /* The delta is how long the sequence is minus 1 (1 is how long * the character that folds to the sequence is) */ - *min_subtract += count - 1; - next_iteration: ; + total_count_delta += count - 1; + next_iteration: ; } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); } else if (OP(scan) == EXACTFA) { @@ -3356,32 +3485,32 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { OP(scan) = EXACTFA_NO_TRIE; - *has_exactf_sharp_s = TRUE; + *unfolded_multi_char = TRUE; break; } s++; continue; } } - else if (OP(scan) != EXACTFL) { - - /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the - * multi-char folds that are all Latin1. (This code knows that - * there are no current multi-char folds possible with EXACTFL, - * relying on fold_grind.t to catch any errors if the very unlikely - * event happens that some get added in future Unicode versions.) - * As explained in the comments preceding this function, we look - * also for the sharp s in EXACTF nodes; it can be in the final - * position. Otherwise we can stop looking 1 byte earlier because - * have to find at least two characters for a multi-fold */ - const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; + else { + + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; while (s < upper) { - int len = is_MULTI_CHAR_FOLD_latin1(s); + int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. */ - if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) { - *has_exactf_sharp_s = TRUE; + *unfolded_multi_char = TRUE; } s++; continue; @@ -3396,8 +3525,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * changed so that a sharp s in the string can match this * ss in the pattern, but they remain EXACTF nodes, as they * won't match this unless the target string is is UTF-8, - * which we don't know until runtime */ - if (OP(scan) != EXACTF) { + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { OP(scan) = EXACTFU_SS; } } @@ -3447,8 +3577,6 @@ typedef struct scan_frame { } scan_frame; -#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) - STATIC SSize_t S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SSize_t *minlenp, SSize_t *deltap, @@ -3498,7 +3626,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, UV min_subtract = 0; /* How mmany chars to subtract from the minimum node length to get a real minimum (because the folded version may be shorter) */ - bool has_exactf_sharp_s = FALSE; + bool unfolded_multi_char = FALSE; /* Peephole optimizer: */ DEBUG_OPTIMISE_MORE_r( { @@ -3527,10 +3655,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, DEBUG_PEEP("Peep", scan, depth); - /* Its not clear to khw or hv why this is done here, and not in the - * clauses that deal with EXACT nodes. khw's guess is that it's - * because of a previous design */ - JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ + JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -3574,11 +3703,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, regnode_ssc accum; regnode * const startbranch=scan; - if (flags & SCF_DO_SUBSTR) - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge - strings after - this. */ - if (flags & SCF_DO_STCLASS) + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + + if (flags & SCF_DO_STCLASS) ssc_init_zero(pRExC_state, &accum); while (OP(scan) == code) { @@ -3635,7 +3765,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - ssc_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); } if (code == IFTHEN && num < 2) /* Empty ELSE branch */ min1 = 0; @@ -3655,15 +3785,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); if (min1) { - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - ssc_and(pRExC_state, data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { @@ -3677,8 +3807,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } - if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) - == BRANCH ) + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) { /* demq. @@ -3755,7 +3885,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, tail ); + regprop(RExC_rx, mysv, tail, NULL); PerlIO_printf( Perl_debug_log, "%*s%s%s\n", (int)depth * 2 + 2, "", "Looking for TRIE'able sequences. Tail node is: ", @@ -3836,16 +3966,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); - regprop(RExC_rx, mysv, noper); + regprop(RExC_rx, mysv, noper, NULL); PerlIO_printf( Perl_debug_log, " -> %s", SvPV_nolen_const(mysv)); if ( noper_next ) { - regprop(RExC_rx, mysv, noper_next ); + regprop(RExC_rx, mysv, noper_next, NULL); PerlIO_printf( Perl_debug_log,"\t=> %s\t", SvPV_nolen_const(mysv)); } @@ -3944,7 +4074,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle unmergable node */ } /* loop over branches */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, @@ -3968,7 +4098,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( startbranch == first && scan == tail ) { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; } } #endif @@ -3984,7 +4114,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * something like this: (?:|) So we can * turn it into a plain NOTHING op. */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); @@ -4047,7 +4177,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* some form of infinite recursion, assume infinite length * */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; @@ -4128,31 +4258,32 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else if (flags & SCF_DO_STCLASS_OR) { ssc_add_cp(data->start_class, uc); - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; } - else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ + else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is + EXACTFish */ SSize_t l = STR_LEN(scan); UV uc = *((U8*)STRING(scan)); SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 separate code points */ + const U8 * s = (U8*)STRING(scan); /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { assert(data); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } if (UTF) { - const U8 * const s = (U8 *)STRING(scan); uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } - if (has_exactf_sharp_s) { - RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; } min += l - min_subtract; assert (min >= 0); @@ -4167,79 +4298,140 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - if (OP(scan) == EXACTFL) { - if (flags & SCF_DO_STCLASS_AND) { - ssc_flags_and(data->start_class, - ANYOF_LOCALE|ANYOF_LOC_FOLD); - } - else if (flags & SCF_DO_STCLASS_OR) { - ANYOF_FLAGS(data->start_class) - |= ANYOF_LOCALE|ANYOF_LOC_FOLD; - } - /* We don't know what the folds are; it could be anything. XXX - * Actually, we only support UTF-8 encoding for code points - * above Latin1, so we could know what those folds are. */ - EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, - 0, - UV_MAX); + if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); } - else { /* Non-locale EXACTFish */ - EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); - if (flags & SCF_DO_STCLASS_AND) { - ssc_clear_locale(data->start_class); + + if (! UTF) { + + /* We punt and assume can match anything if the node begins + * with a multi-character fold. Things are complicated. For + * example, /ffi/i could match any of: + * "\N{LATIN SMALL LIGATURE FFI}" + * "\N{LATIN SMALL LIGATURE FF}I" + * "F\N{LATIN SMALL LIGATURE FI}" + * plus several other things; and making sure we have all the + * possibilities is hard. */ + if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); } - if (uc < 256) { /* We know what the Latin1 folds are ... */ - if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we - know if anything folds - with this */ - EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + else { + + /* Any Latin1 range character can potentially match any + * other depending on the locale */ + if (OP(scan) == EXACTFL) { + _invlist_union(EXACTF_invlist, PL_Latin1, + &EXACTF_invlist); + } + else { + /* But otherwise, it matches at least itself. We can + * quickly tell if it has a distinct fold, and if so, + * it matches that as well */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (IS_IN_SOME_FOLD_L1(uc)) { + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, PL_fold_latin1[uc]); - if (OP(scan) != EXACTFA) { /* The folds below aren't - legal under /iaa */ - if (isARG2_lower_or_UPPER_ARG1('s', uc)) { - EXACTF_invlist - = add_cp_to_invlist(EXACTF_invlist, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - EXACTF_invlist - = add_cp_to_invlist(EXACTF_invlist, 's'); - EXACTF_invlist - = add_cp_to_invlist(EXACTF_invlist, 'S'); - } } + } - /* We also know if there are above-Latin1 code points - * that fold to this (none legal for ASCII and /iaa) */ - if ((! isASCII(uc) || OP(scan) != EXACTFA) - && HAS_NONLATIN1_FOLD_CLOSURE(uc)) - { - /* XXX We could know exactly what does fold to this - * if the reverse folds are loaded, as currently in - * S_regclass() */ - _invlist_union(EXACTF_invlist, - PL_AboveLatin1, - &EXACTF_invlist); + /* Some characters match above-Latin1 ones under /i. This + * is true of EXACTFL ones when the locale is UTF-8 */ + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) + && (! isASCII(uc) || (OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE))) + { + add_above_Latin1_folds(pRExC_state, + (U8) uc, + &EXACTF_invlist); + } + } + } + else { /* Pattern is UTF-8 */ + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + STRLEN foldlen = UTF8SKIP(s); + const U8* e = s + STR_LEN(scan); + SV** listp; + + /* The only code points that aren't folded in a UTF EXACTFish + * node are are the problematic ones in EXACTFL nodes */ + if (OP(scan) == EXACTFL + && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) + { + /* We need to check for the possibility that this EXACTFL + * node begins with a multi-char fold. Therefore we fold + * the first few characters of it so that we can make that + * check */ + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD(*s); + s++; + } + else { + STRLEN len; + to_utf8_fold(s, d, &len); + d += len; + s += UTF8SKIP(s); } } + + /* And set up so the code below that looks in this folded + * buffer instead of the node's string */ + e = d; + foldlen = UTF8SKIP(folded); + s = folded; + } + + /* When we reach here 's' points to the fold of the first + * character(s) of the node; and 'e' points to far enough along + * the folded string to be just past any possible multi-char + * fold. 'foldlen' is the length in bytes of the first + * character in 's' + * + * Unlike the non-UTF-8 case, the macro for determining if a + * string is a multi-char fold requires all the characters to + * already be folded. This is because of all the complications + * if not. Note that they are folded anyway, except in EXACTFL + * nodes. Like the non-UTF case above, we punt if the node + * begins with a multi-char fold */ + + if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); } - else { /* Non-locale, above Latin1. XXX We don't currently - know what participates in folds with this, so have - to assume anything could */ - - /* XXX We could know exactly what does fold to this if the - * reverse folds are loaded, as currently in S_regclass(). - * But we do know that under /iaa nothing in the ASCII - * range can participate */ - if (OP(scan) == EXACTFA) { - _invlist_union_complement_2nd(EXACTF_invlist, - PL_XPosix_ptrs[_CC_ASCII], - &EXACTF_invlist); + else { /* Single char fold */ + + /* It matches all the things that fold to it, which are + * found in PL_utf8_foldclosures (including itself) */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); } - else { - EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, - 0, UV_MAX); + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) s, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE) + && isASCII(c) != isASCII(uc)) + { + continue; + } + + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c); + } } } } @@ -4250,7 +4442,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else if (flags & SCF_DO_STCLASS_OR) { ssc_union(data->start_class, EXACTF_invlist, FALSE); - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; @@ -4284,7 +4476,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_SUBSTR) data->pos_min++; min++; - /* Fall through. */ + /* FALLTHROUGH */ case STAR: if (flags & SCF_DO_STCLASS) { mincount = 0; @@ -4293,13 +4485,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan = NEXTOPER(scan); goto do_curly; } - is_inf = is_inf_internal = 1; - scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); /* Cannot extend fixed substrings */ data->longest = &(data->longest_float); } + is_inf = is_inf_internal = 1; + scan = regnext(scan); goto optimize_curly_tail; case CURLY: if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) @@ -4320,7 +4512,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, next_is_eval = (OP(scan) == EVAL); do_curly: if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); /* Cannot extend fixed substrings */ pos_before = data->pos_min; } @@ -4361,7 +4554,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->start_class = oclass; if (mincount == 0 || minnext == 0) { if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, &this_class); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); } else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of @@ -4375,11 +4568,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, &this_class); - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); } else if (flags & SCF_DO_STCLASS_AND) - ssc_and(pRExC_state, data->start_class, &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); flags &= ~SCF_DO_STCLASS; } if (!scan) /* It was not CURLYX, but CURLY. */ @@ -4403,12 +4596,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, is_inf_internal |= deltanext == SSize_t_MAX || (maxcount == REG_INFTY && minnext + deltanext > 0); is_inf |= is_inf_internal; - if (is_inf) + if (is_inf) { delta = SSize_t_MAX; - else + } else { delta += (minnext + deltanext) * maxcount - minnext * mincount; - + } /* Try powerful optimization CURLYX => CURLYN. */ if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR @@ -4459,7 +4652,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && !(data->flags & SF_HAS_EVAL) && !deltanext /* atom is fixed width */ && minnext != 0 /* CURLYM can't handle zero width */ - && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */ + + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -4531,6 +4727,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, pars++; if (flags & SCF_DO_SUBSTR) { SV *last_str = NULL; + STRLEN last_chrs = 0; int counted = mincount != 0; if (data->last_end > 0 && mincount != 0) { /* Ends with a @@ -4546,9 +4743,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { + SvGROW(last_str, (mincount * l) + 1); repeatcpy(SvPVX(last_str) + l, SvPVX_const(last_str), l, @@ -4564,8 +4764,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str) - l; + mg->mg_len += last_chrs * (mincount-1); } + last_chrs *= mincount; data->last_end += l * (mincount - 1); } } else { @@ -4578,13 +4779,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* It is counted once already... */ data->pos_min += minnext * (mincount - counted); #if 0 -PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf - " SSize_t_MAX=%"UVdf" minnext=%"UVdf - " maxcount=%"UVdf" mincount=%"UVdf"\n", +PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf + " SSize_t_MAX=%"UVuf" minnext=%"UVuf + " maxcount=%"UVuf" mincount=%"UVuf"\n", (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, (UV)mincount); if (deltanext != SSize_t_MAX) -PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", +PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", (UV)(-counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); #endif @@ -4597,7 +4798,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", if (mincount != maxcount) { /* Cannot extend fixed substrings found inside the group. */ - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); if (mincount && last_str) { SV * const sv = data->last_found; MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? @@ -4607,12 +4808,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", mg->mg_len = -1; sv_setsv(sv, last_str); data->last_end = data->pos_min; - data->last_start_min = - data->pos_min - CHR_SVLEN(last_str); + data->last_start_min = data->pos_min - last_chrs; data->last_start_max = is_inf ? SSize_t_MAX - : data->pos_min + data->pos_delta - - CHR_SVLEN(last_str); + : data->pos_min + data->pos_delta - last_chrs; } data->longest = &(data->longest_float); } @@ -4636,8 +4835,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", case REF: case CLUMP: if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect - anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; @@ -4667,7 +4866,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", ssc_union(data->start_class, PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); /* See commit msg for * 749e076fceedeb708a624933726e7989f2302f6a */ @@ -4678,8 +4877,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", min++; delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect - anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += 1; data->pos_delta += 1; data->longest = &(data->longest_float); @@ -4688,7 +4887,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", else if (REGNODE_SIMPLE(OP(scan))) { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min++; } min++; @@ -4741,15 +4940,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", case ANYOF: if (flags & SCF_DO_STCLASS_AND) ssc_and(pRExC_state, data->start_class, - (regnode_ssc*) scan); + (regnode_charclass *) scan); else ssc_or(pRExC_state, data->start_class, - (regnode_ssc*)scan); + (regnode_charclass *) scan); break; case NPOSIXL: invert = 1; - /* FALL THROUGH */ + /* FALLTHROUGH */ case POSIXL: namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; @@ -4779,16 +4978,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", ssc_match_all_cp(data->start_class); ANYOF_POSIXL_CLEAR(data->start_class, namedclass); ANYOF_POSIXL_CLEAR(data->start_class, complement); - if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class)) - { - ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL; - } } else { /* The usual case; just add this class to the existing set */ ANYOF_POSIXL_SET(data->start_class, namedclass); - ANYOF_FLAGS(data->start_class) - |= ANYOF_LOCALE|ANYOF_POSIXL; } } break; @@ -4796,7 +4989,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", case NPOSIXA: /* For these, we always know the exact set of what's matched */ invert = 1; - /* FALL THROUGH */ + /* FALLTHROUGH */ case POSIXA: if (FLAGS(scan) == _CC_ASCII) { my_invlist = PL_XPosix_ptrs[_CC_ASCII]; @@ -4811,17 +5004,19 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", case NPOSIXD: case NPOSIXU: invert = 1; - /* FALL THROUGH */ + /* FALLTHROUGH */ case POSIXD: case POSIXU: my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); /* NPOSIXD matches all upper Latin1 code points unless the * target string being matched is UTF-8, which is - * unknowable until match time */ - if (PL_regkind[OP(scan)] == NPOSIXD) { - _invlist_union_complement_2nd(my_invlist, - PL_XPosix_ptrs[_CC_ASCII], &my_invlist); + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); } join_posix: @@ -4836,7 +5031,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", } } if (flags & SCF_DO_STCLASS_OR) - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } @@ -4844,7 +5039,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } else if ( PL_regkind[OP(scan)] == BRANCHJ @@ -4863,7 +5058,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", DEBUG_STUDYDATA("OPFAIL",data,depth); /*DEBUG_PARSE_MSG("opfail");*/ - regprop(RExC_rx, mysv_val, upto); + regprop(RExC_rx, mysv_val, upto, NULL); PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val), @@ -4939,7 +5134,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", ssc_init(pRExC_state, data->start_class); } else { /* AND before and after: combine and continue */ - ssc_and(pRExC_state, data->start_class, &intrnl); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } } } @@ -4971,7 +5166,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", if ((flags & SCF_DO_SUBSTR) && data->last_found) { f |= SCF_DO_SUBSTR; if (scan->flags) - SCAN_COMMIT(pRExC_state, &data_fake,minlenp); + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); data_fake.last_found=newSVsv(data->last_found); } } @@ -5010,7 +5205,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", *minnextp += min; if (f & SCF_DO_STCLASS_AND) { - ssc_and(pRExC_state, data->start_class, &intrnl); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -5021,7 +5216,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { if (RExC_rx->minlen<*minnextp) RExC_rx->minlen=*minnextp; - SCAN_COMMIT(pRExC_state, &data_fake, minnextp); + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); SvREFCNT_dec_NN(data_fake.last_found); if ( data_fake.minlen_fixed != minlenp ) @@ -5065,7 +5260,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", } else if ( PL_regkind[OP(scan)] == ENDLIKE ) { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); flags &= ~SCF_DO_SUBSTR; } if (data && OP(scan)==ACCEPT) { @@ -5077,7 +5272,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; @@ -5086,15 +5281,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", flags &= ~SCF_DO_STCLASS; } else if (OP(scan) == GPOS) { - if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && !(delta || is_inf || (data && data->pos_delta))) { - if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR)) - RExC_rx->extflags |= RXf_ANCH_GPOS; + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; if (RExC_rx->gofs < (STRLEN)min) RExC_rx->gofs = min; } else { - RExC_rx->extflags |= RXf_GPOS_FLOAT; + RExC_rx->intflags |= PREGf_GPOS_FLOAT; RExC_rx->gofs = 0; } } @@ -5110,9 +5305,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", SSize_t max1 = 0, min1 = SSize_t_MAX; regnode_ssc accum; - if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ - SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings - after this. */ + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } if (flags & SCF_DO_STCLASS) ssc_init_zero(pRExC_state, &accum); @@ -5181,7 +5377,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - ssc_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); } } if (flags & SCF_DO_SUBSTR) { @@ -5193,15 +5389,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); if (min1) { - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - ssc_and(pRExC_state, data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { @@ -5226,8 +5422,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", delta += (trie->maxlen - trie->minlen); flags &= ~SCF_DO_STCLASS; /* xxx */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect - anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += trie->minlen; data->pos_delta += (trie->maxlen - trie->minlen); if (trie->maxlen != trie->minlen) @@ -5270,6 +5466,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", *scanp = scan; *deltap = is_inf_internal ? SSize_t_MAX : delta; + if (flags & SCF_DO_SUBSTR && is_inf) data->pos_delta = SSize_t_MAX - data->pos_min; if (is_par > (I32)U8_MAX) @@ -5283,13 +5480,21 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", data->flags &= ~SF_IN_PAR; } if (flags & SCF_DO_STCLASS_OR) - ssc_and(pRExC_state, data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); if (flags & SCF_TRIE_RESTUDY) data->flags |= SCF_TRIE_RESTUDY; DEBUG_STUDYDATA("post-fin:",data,depth); - return min < stopmin ? min : stopmin; + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ } STATIC U32 @@ -5311,7 +5516,8 @@ S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) return count; } -/*XXX: todo make this not included in a non debugging perl */ +/*XXX: todo make this not included in a non debugging perl, but appears to be + * used anyway there, in 'use re' */ #ifndef PERL_IN_XSUB_RE void Perl_reginitcolors(pTHX) @@ -5514,7 +5720,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, /* if we know we have at least two args, create an empty string, * then concatenate args to that. For no args, return an empty string */ if (!pat && pat_count != 1) { - pat = newSVpvn("", 0); + pat = newSVpvs(""); SAVEFREESV(pat); alloced = TRUE; } @@ -5955,8 +6161,8 @@ S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, || (eol /* Can't have SEOL and MULTI */ && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) ) - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - || (RExC_seen & REG_SEEN_EXACTF_SHARP_S)) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) { return FALSE; } @@ -6243,10 +6449,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (rx_flags & PMf_FOLD) { RExC_contains_i = 1; } - if (initial_charset == REGEX_LOCALE_CHARSET) { - RExC_contains_locale = 1; - } - else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ @@ -6274,6 +6477,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_sawback = 0; RExC_seen = 0; + RExC_maxlen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; @@ -6414,8 +6618,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, bool has_default = (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) || ! has_charset); - bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT) - == REG_SEEN_RUN_ON_COMMENT); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ @@ -6479,13 +6683,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* setup various meta data about recursion, this all requires * RExC_npar to be correctly set, and a bit later on we clear it */ - if (RExC_seen & REG_SEEN_RECURSE) { + if (RExC_seen & REG_RECURSE_SEEN) { Newxz(RExC_open_parens, RExC_npar,regnode *); SAVEFREEPV(RExC_open_parens); Newxz(RExC_close_parens,RExC_npar,regnode *); SAVEFREEPV(RExC_close_parens); } - if (RExC_seen & (REG_SEEN_RECURSE | REG_SEEN_GOSTART)) { + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { /* Note, RExC_npar is 1 + the number of parens in a pattern. * So its 1 if there are no parens. */ RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + @@ -6549,10 +6753,10 @@ reStudy: DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); RExC_state = copyRExC_state; - if (seen & REG_TOP_LEVEL_BRANCHES) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; else - RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; StructCopy(&zero_scan_data, &data, scan_data_t); } #else @@ -6572,7 +6776,7 @@ reStudy: /* testing for BRANCH here tells us whether there is "must appear" data in the pattern. If there is then we can use it for optimisations */ - if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. */ SSize_t fake; STRLEN longest_float_length, longest_fixed_length; @@ -6634,22 +6838,8 @@ reStudy: else if (PL_regkind[OP(first)] == TRIE && ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) { - regnode *trie_op; - /* this can happen only on restudy */ - if ( OP(first) == TRIE ) { - struct regnode_1 *trieop = (struct regnode_1 *) - PerlMemShared_calloc(1, sizeof(struct regnode_1)); - StructCopy(first,trieop,struct regnode_1); - trie_op=(regnode *)trieop; - } else { - struct regnode_charclass *trieop = (struct regnode_charclass *) - PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); - StructCopy(first,trieop,struct regnode_charclass); - trie_op=(regnode *)trieop; - } - OP(trie_op)+=2; - make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); - ri->regstclass = trie_op; + /* this can happen only on restudy */ + ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); } #endif else if (REGNODE_SIMPLE(OP(first))) @@ -6658,31 +6848,30 @@ reStudy: PL_regkind[OP(first)] == NBOUND) ri->regstclass = first; else if (PL_regkind[OP(first)] == BOL) { - r->extflags |= (OP(first) == MBOL - ? RXf_ANCH_MBOL + r->intflags |= (OP(first) == MBOL + ? PREGf_ANCH_MBOL : (OP(first) == SBOL - ? RXf_ANCH_SBOL - : RXf_ANCH_BOL)); + ? PREGf_ANCH_SBOL + : PREGf_ANCH_BOL)); first = NEXTOPER(first); goto again; } else if (OP(first) == GPOS) { - r->extflags |= RXf_ANCH_GPOS; + r->intflags |= PREGf_ANCH_GPOS; first = NEXTOPER(first); goto again; } else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) + !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = (OP(NEXTOPER(first)) == REG_ANY) - ? RXf_ANCH_MBOL - : RXf_ANCH_SBOL; - r->extflags |= type; - r->intflags |= PREGf_IMPLICIT; + ? PREGf_ANCH_MBOL + : PREGf_ANCH_SBOL; + r->intflags |= (type | PREGf_IMPLICIT); first = NEXTOPER(first); goto again; } @@ -6751,9 +6940,11 @@ reStudy: if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen - && !(RExC_seen & REG_SEEN_VERBARG) - && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ r->extflags |= RXf_CHECK_ALL; + } scan_commit(pRExC_state, &data,&minlen,0); longest_float_length = CHR_SVLEN(data.longest_float); @@ -6827,7 +7018,7 @@ reStudy: ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); @@ -6837,14 +7028,16 @@ reStudy: /* A temporary algorithm prefers floated substr to fixed one to dig * more info. */ if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; r->check_end_shift = r->anchored_end_shift; r->check_substr = r->anchored_substr; r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; - if (r->extflags & RXf_ANCH_SINGLE) - r->extflags |= RXf_NOSCAN; + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) + r->intflags |= PREGf_NOSCAN; } else { + r->substrs->check_ix = 1; r->check_end_shift = r->float_end_shift; r->check_substr = r->float_substr; r->check_utf8 = r->float_utf8; @@ -6856,6 +7049,8 @@ reStudy: if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->extflags |= RXf_INTUIT_TAIL; } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) if ( (STRLEN)minlen < longest_float_length ) minlen= longest_float_length; @@ -6903,7 +7098,7 @@ reStudy: ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); @@ -6911,31 +7106,39 @@ reStudy: } } + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", - (IV)minlen, (IV)r->minlen); + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); }); r->minlenret = minlen; if (r->minlen < minlen) r->minlen = minlen; - if (RExC_seen & REG_SEEN_GPOS) - r->extflags |= RXf_GPOS_SEEN; - if (RExC_seen & REG_SEEN_LOOKBEHIND) + if (RExC_seen & REG_GPOS_SEEN) + r->intflags |= PREGf_GPOS_SEEN; + if (RExC_seen & REG_LOOKBEHIND_SEEN) r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */ if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; - if (RExC_seen & REG_SEEN_CANY) - r->extflags |= RXf_CANY_SEEN; - if (RExC_seen & REG_SEEN_VERBARG) + if (RExC_seen & REG_CANY_SEEN) + r->intflags |= PREGf_CANY_SEEN; + if (RExC_seen & REG_VERBARG_SEEN) { r->intflags |= PREGf_VERBARG_SEEN; r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ } - if (RExC_seen & REG_SEEN_CUTGROUP) + if (RExC_seen & REG_CUTGROUP_SEEN) r->intflags |= PREGf_CUTGROUP_SEEN; if (pm_flags & PMf_USE_RE_EVAL) r->intflags |= PREGf_USE_RE_EVAL; @@ -6944,7 +7147,20 @@ reStudy: else RXp_PAREN_NAMES(r) = NULL; + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED + * so it can be used in pp.c */ + if (r->intflags & PREGf_ANCH) + r->extflags |= RXf_IS_ANCHORED; + + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ regnode *first = ri->program + 1; U8 fop = OP(first); regnode *next = NEXTOPER(first); @@ -6966,6 +7182,11 @@ reStudy: r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + #ifdef DEBUGGING if (RExC_paren_names) { ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); @@ -7193,7 +7414,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) } else if (flags & RXapif_ONE) { ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); av = MUTABLE_AV(SvRV(ret)); - length = av_len(av); + length = av_tindex(av); SvREFCNT_dec_NN(ret); return newSViv(length + 1); } else { @@ -7307,7 +7528,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, assert(s >= rx->subbeg); assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); if (i >= 0) { -#if NO_TAINT_SUPPORT +#ifdef NO_TAINT_SUPPORT sv_setpvn(sv, s, i); #else const int oldtainted = TAINT_get; @@ -7315,7 +7536,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, sv_setpvn(sv, s, i); TAINT_set(oldtainted); #endif - if ( (rx->extflags & RXf_CANY_SEEN) + if ( (rx->intflags & PREGf_CANY_SEEN) ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) : (RXp_MATCH_UTF8(rx)) ) @@ -8921,6 +9142,23 @@ Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, count += 2; } } + +void +Perl__load_PL_utf8_foldclosures (pTHX) +{ + assert(! PL_utf8_foldclosures); + + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); +} #endif #ifdef PERL_ARGS_ASSERT__INVLISTEQ @@ -9053,7 +9291,6 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) } cs = REGEX_LOCALE_CHARSET; has_charset_modifier = LOCALE_PAT_MOD; - RExC_contains_locale = 1; break; case UNICODE_PAT_MOD: if (has_charset_modifier) { @@ -9265,8 +9502,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) int internal_argval = 0; /* internal_argval is only useful if !argok */ - if (has_intervening_patws && SIZE_ONLY) { - ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated"); + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); } while ( *RExC_parse && *RExC_parse != ')' ) { if ( *RExC_parse == ':' ) { @@ -9326,7 +9564,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* [19:06] :: is then */ if ( memEQs(start_verb,verb_len,"THEN") ) { op = CUTGROUP; - RExC_seen |= REG_SEEN_CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; } break; } @@ -9359,7 +9597,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } if (!internal_argval) - RExC_seen |= REG_SEEN_VERBARG; + RExC_seen |= REG_VERBARG_SEEN; } else if ( start_arg ) { vFAIL3("Verb pattern '%.*s' may not have an argument", verb_len, start_verb); @@ -9372,8 +9610,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else if (*RExC_parse == '?') { /* (?...) */ bool is_logical = 0; const char * const seqstart = RExC_parse; - if (has_intervening_patws && SIZE_ONLY) { - ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated"); + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(?...)', the '(' and '?' must be adjacent"); } RExC_parse++; @@ -9512,9 +9751,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) paren = 1; goto capturing_parens; } - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; + /* FALLTHROUGH */ case '=': /* (?=...) */ RExC_seen_zerolen++; break; @@ -9539,24 +9779,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '@': /* (?@...) */ vFAIL2("Sequence (?%c...) not implemented", (int)paren); break; - case '#': /* (?#...) */ - /* XXX As soon as we disallow separating the '?' and '*' (by - * spaces or (?#...) comment), it is believed that this case - * will be unreachable and can be removed. See - * [perl #117327] */ - while (*RExC_parse && *RExC_parse != ')') - RExC_parse++; - if (*RExC_parse != ')') - FAIL("Sequence (?#... not terminated"); - nextchar(pRExC_state); - *flagp = TRYAGAIN; - return NULL; case '0' : /* (?0) */ case 'R' : /* (?R) */ if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); - RExC_seen |= REG_SEEN_GOSTART; + RExC_seen |= REG_GOSTART_SEEN; *flagp |= POSTPONED; nextchar(pRExC_state); return ret; @@ -9587,7 +9815,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse--; /* rewind to let it be handled later */ goto parse_flags; } - /*FALLTHROUGH */ + /* FALLTHROUGH */ case '1': case '2': case '3': case '4': /* (?1) */ case '5': case '6': case '7': case '8': case '9': RExC_parse--; @@ -9637,7 +9865,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else { RExC_size++; } - RExC_seen |= REG_SEEN_RECURSE; + RExC_seen |= REG_RECURSE_SEEN; Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ @@ -9659,7 +9887,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } *flagp |= POSTPONED; paren = *RExC_parse++; - /* FALL THROUGH */ + /* FALLTHROUGH */ case '{': /* (?{...}) */ { U32 n = 0; @@ -9897,7 +10125,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE + if (RExC_seen & REG_RECURSE_SEEN && !RExC_open_parens[parno-1]) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, @@ -9988,7 +10216,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 1: case 2: ender = reganode(pRExC_state, CLOSE, parno); - if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting close paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ender))); @@ -10004,7 +10232,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '=': case '!': *flagp &= ~HASWIDTH; - /* FALL THROUGH */ + /* FALLTHROUGH */ case '>': ender = reg_node(pRExC_state, SUCCEED); break; @@ -10020,8 +10248,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("lsbr"); - regprop(RExC_rx, mysv_val1, lastbr); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(lastbr), @@ -10035,7 +10263,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (have_branch && !SIZE_ONLY) { char is_nothing= 1; if (depth==1) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; /* Hook the tails of the branches to the closing node. */ for (br = ret; br; br = regnext(br)) { @@ -10061,8 +10289,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("NADA"); - regprop(RExC_rx, mysv_val1, ret); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(ret), @@ -10358,6 +10586,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; goto nest_check; } @@ -10395,6 +10625,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, STAR, ret, depth+1); ret->flags = 0; RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '*') { min = 0; @@ -10404,6 +10635,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, PLUS, ret, depth+1); ret->flags = 0; RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '+') { min = 1; @@ -10522,7 +10754,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, * modifier. The other meaning does not, so use a temporary until we find * out which we are being called with */ p = (RExC_flags & RXf_PMf_EXTENDED) - ? regwhite( pRExC_state, RExC_parse ) + ? regpatws(pRExC_state, RExC_parse, + TRUE) /* means recognize comments */ : RExC_parse; /* Disambiguate between \N meaning a named character versus \N meaning @@ -10770,7 +11003,8 @@ S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) PERL_STATIC_INLINE void S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, - regnode *node, I32* flagp, STRLEN len, UV code_point) + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) { /* This knows the details about sizing an EXACTish node, setting flags for * it (by setting <*flagp>, and potentially populating it with a single @@ -10792,13 +11026,24 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, * * It knows that under FOLD, the Latin Sharp S and UTF characters above * 255, must be folded (the former only when the rules indicate it can - * match 'ss') */ + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ bool len_passed_in = cBOOL(len != 0); U8 character[UTF8_MAXBYTES_CASE+1]; PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + if (! len_passed_in) { if (UTF) { if (UNI_IS_INVARIANT(code_point)) { @@ -10810,19 +11055,31 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, EBCDIC, but it works there, as the extra invariants fold to themselves) */ *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } } len = 1; } - else if (FOLD && (! LOC || code_point > 255)) { - /* Folding, and ok to do so now */ - _to_uni_fold_flags(code_point, + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ + UV folded = _to_uni_fold_flags( + code_point, character, &len, - FOLD_FLAGS_FULL | ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } } else if (code_point <= MAX_UTF8_TWO_BYTE) { @@ -10835,19 +11092,36 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, uvchr_to_utf8( character, code_point); len = UTF8SKIP(character); } - } /* Else pattern isn't UTF8. We only fold the sharp s, when - appropriate */ - else if (UNLIKELY(code_point == LATIN_SMALL_LETTER_SHARP_S) - && FOLD - && AT_LEAST_UNI_SEMANTICS - && ! ASCII_FOLD_RESTRICTED) - { + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { + *character = (U8) code_point; + len = 1; + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { *character = 's'; *(character + 1) = 's'; len = 2; } else { - *character = (U8) code_point; + *character = LATIN_SMALL_LETTER_SHARP_S; len = 1; } } @@ -10873,6 +11147,11 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, { *flagp |= SIMPLE; } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } } @@ -10884,7 +11163,7 @@ S_backref_value(char *p) { char *q = p; - for (;isDIGIT(*q); q++); /* calculate length of num */ + for (;isDIGIT(*q); q++) {} /* calculate length of num */ if (q - p == 0 || q - p > 9) return I32_MAX; return atoi(p); @@ -11067,7 +11346,7 @@ tryagain: RExC_parse++; goto defchar; } - /* FALL THROUGH */ + /* FALLTHROUGH */ case '?': case '+': case '*': @@ -11097,7 +11376,7 @@ tryagain: goto finish_meta_pat; case 'G': ret = reg_node(pRExC_state, GPOS); - RExC_seen |= REG_SEEN_GPOS; + RExC_seen |= REG_GPOS_SEEN; *flagp |= SIMPLE; goto finish_meta_pat; case 'K': @@ -11108,7 +11387,7 @@ tryagain: * be necessary here to avoid cases of memory corruption, as * with: C<$_="x" x 80; s/x\K/y/> -- rgs */ - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -11122,7 +11401,7 @@ tryagain: goto finish_meta_pat; case 'C': ret = reg_node(pRExC_state, CANY); - RExC_seen |= REG_SEEN_CANY; + RExC_seen |= REG_CANY_SEEN; *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'X': @@ -11139,30 +11418,38 @@ tryagain: case 'b': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = BOUND + get_regex_charset(RExC_flags); if (op > BOUNDA) { /* /aa is same as /a */ op = BOUNDA; } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); } goto finish_meta_pat; case 'B': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = NBOUND + get_regex_charset(RExC_flags); if (op > NBOUNDA) { /* /aa is same as /a */ op = NBOUNDA; } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); } goto finish_meta_pat; @@ -11206,6 +11493,9 @@ tryagain: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } join_posix_op_known: @@ -11219,7 +11509,7 @@ tryagain: } *flagp |= HASWIDTH|SIMPLE; - /* FALL THROUGH */ + /* FALLTHROUGH */ finish_meta_pat: nextchar(pRExC_state); @@ -11365,11 +11655,15 @@ tryagain: } else { num = S_backref_value(RExC_parse); - /* bare \NNN might be backref or octal */ + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ if (num == I32_MAX || (num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')) + { /* Probably a character specified in octal, e.g. \35 */ goto defchar; + } } /* at this point RExC_parse definitely points to a backref @@ -11414,7 +11708,7 @@ tryagain: case '\0': if (RExC_parse >= RExC_end) FAIL("Trailing \\"); - /* FALL THROUGH */ + /* FALLTHROUGH */ default: /* Do not generate "unrecognized" warnings here, we fall back into the quick-grab loop below */ @@ -11425,10 +11719,11 @@ tryagain: case '#': if (RExC_flags & RXf_PMf_EXTENDED) { - if ( reg_skipcomment( pRExC_state ) ) + RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); + if (RExC_parse < RExC_end) goto tryagain; } - /* FALL THROUGH */ + /* FALLTHROUGH */ default: @@ -11445,7 +11740,6 @@ tryagain: char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; char *s0; U8 upper_parse = MAX_NODE_STRING_SIZE; - STRLEN foldlen; U8 node_type = compute_EXACTish(pRExC_state); bool next_is_quantifier; char * oldp = NULL; @@ -11453,9 +11747,15 @@ tryagain: /* We can convert EXACTF nodes to EXACTFU if they contain only * characters that match identically regardless of the target * string's UTF8ness. The reason to do this is that EXACTF is not - * trie-able, EXACTFU is. (We don't need to figure this out until - * pass 2) */ - bool maybe_exactfu = node_type == EXACTF && PASS2; + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); /* If a folding node contains only code points that don't * participate in folds, it can be changed into an EXACT node, @@ -11472,10 +11772,9 @@ tryagain: reparse: - /* We do the EXACTFish to EXACT node only if folding, and not if in - * locale, as whether a character folds or not isn't known until - * runtime. (And we don't need to figure this out until pass 2) */ - maybe_exact = FOLD && ! LOC && PASS2; + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -11502,7 +11801,8 @@ tryagain: oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ switch ((U8)*p) { case '^': case '$': @@ -11652,7 +11952,7 @@ tryagain: } case 'c': p++; - ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); + ender = grok_bslash_c(*p++, SIZE_ONLY); break; case '8': case '9': /* must be a backreference */ --p; @@ -11671,12 +11971,17 @@ tryagain: * 118 OR as "\11" . "8" depending on whether there * were 118 capture buffers defined already in the * pattern. */ - if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar) + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) { /* Not to be treated as an octal constant, go find backref */ --p; goto loopdone; } + /* FALLTHROUGH */ case '0': { I32 flags = PERL_SCAN_SILENT_ILLDIGIT; @@ -11711,7 +12016,7 @@ tryagain: case '\0': if (p >= RExC_end) FAIL("Trailing \\"); - /* FALL THROUGH */ + /* FALLTHROUGH */ default: if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { /* Include any { following the alpha to emphasize @@ -11725,15 +12030,6 @@ tryagain: break; default: /* A literal character */ - if (! SIZE_ONLY - && RExC_flags & RXf_PMf_EXTENDED - && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low(p, UTF)) - { - vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), - "Escape literal pattern white space under /x"); - } - normal_default: if (UTF8_IS_START(*p) && UTF) { STRLEN numlen; @@ -11751,7 +12047,8 @@ tryagain: */ if ( RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ /* If the next thing is a quantifier, it applies to this * character only, which means that this character has to be in @@ -11766,7 +12063,10 @@ tryagain: goto loopdone; } - if (! FOLD) { /* The simple case, just append the literal */ + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { if (UTF) { const STRLEN unilen = reguni(pRExC_state, ender, s); if (unilen > 0) { @@ -11784,15 +12084,27 @@ tryagain: else { REGC((char)ender, s++); } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } } - else /* FOLD */ if (! ( UTF + else /* FOLD */ + if (! ( UTF /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S))) { /* Here, are folding and are not UTF-8 encoded; therefore - * the character must be in the range 0-255. */ + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ if (IS_IN_SOME_FOLD_L1(ender)) { maybe_exact = FALSE; @@ -11822,62 +12134,50 @@ tryagain: * unfolded, and we have to calculate how many EXACTish * nodes it will take; and we may run out of room in a node * in the middle of a potential multi-char fold, and have - * to back off accordingly. */ - - /* Prime the casefolded buffer. Locale rules, which apply - * only to code points < 256, aren't known until execution, - * so for them, just output the original character using - * utf8. If we start to fold non-UTF patterns, be sure to - * update join_exact() */ - if (LOC && ender < 256) { - if (UVCHR_IS_INVARIANT(ender)) { - *s = (U8) ender; - foldlen = 1; - } else { - *s = UTF8_TWO_BYTE_HI(ender); - *(s + 1) = UTF8_TWO_BYTE_LO(ender); - foldlen = 2; - } + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; + } + else { + STRLEN foldlen; + + folded = _to_uni_fold_flags( + ender, + (U8 *) s, + &foldlen, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; } - else { /* ender >= 256 */ - UV folded = _to_uni_fold_flags( - ender, - (U8 *) s, - &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) - ); - - /* If this node only contains non-folding code points - * so far, see if this new one is also non-folding */ - if (maybe_exact) { - if (folded != ender) { + /* If this node only contains non-folding code points so + * far, see if this new one is also non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have to check + * further to see if anything folds to it */ + if (_invlist_contains_cp(PL_utf8_foldable, + ender)) + { maybe_exact = FALSE; } - else { - /* Here the fold is the original; we have - * to check further to see if anything - * folds to it */ - if (_invlist_contains_cp(PL_utf8_foldable, - ender)) - { - maybe_exact = FALSE; - } - } } - ender = folded; } - s += foldlen; - - /* The loop increments each time, as all but this - * path (and one other) through it add a single byte to the - * EXACTish node. But this one has changed len to be the - * correct final value, so subtract one to cancel out the - * increment that follows */ - len += foldlen - 1; + ender = folded; } if (next_is_quantifier) { @@ -11926,9 +12226,8 @@ tryagain: if (! UTF) { - /* These two have no multi-char folds to non-UTF characters - */ - if (ASCII_FOLD_RESTRICTED || LOC) { + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { goto loopdone; } @@ -11959,11 +12258,7 @@ tryagain: } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - - /* No Latin1 characters participate in multi-char - * folds under /l */ - if (LOC - || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( *s, *(s+1)))) { break; @@ -12074,7 +12369,7 @@ tryagain: * code points in the node that participate in folds; * similarly for 'maybe_exactfu' and code points that match * differently depending on UTF8ness of the target string - * */ + * (for /u), or depending on locale for /l */ if (maybe_exact) { OP(ret) = EXACT; } @@ -12082,7 +12377,12 @@ tryagain: OP(ret) = EXACTFU; } } - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); } RExC_parse = p - 1; @@ -12103,39 +12403,11 @@ tryagain: } STATIC char * -S_regwhite( RExC_state_t *pRExC_state, char *p ) -{ - const char *e = RExC_end; - - PERL_ARGS_ASSERT_REGWHITE; - - while (p < e) { - if (isSPACE(*p)) - ++p; - else if (*p == '#') { - bool ended = 0; - do { - if (*p++ == '\n') { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; - } - else - break; - } - return p; -} - -STATIC char * -S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) +S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) { /* Returns the next non-pattern-white space, non-comment character (the * latter only if 'recognize_comment is true) in the string p, which is - * ended by RExC_end. If there is no line break ending a comment, - * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */ + * ended by RExC_end. See also reg_skipcomment */ const char *e = RExC_end; PERL_ARGS_ASSERT_REGPATWS; @@ -12146,16 +12418,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) p += len; } else if (recognize_comment && *p == '#') { - bool ended = 0; - do { - p++; - if (is_LNBREAK_safe(p, e, UTF)) { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + p = reg_skipcomment(pRExC_state, p); } else break; @@ -12190,6 +12453,9 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) if (end == UV_MAX && start <= 256) { ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } /* Quit if are above what we should change */ if (start > 255) { @@ -12480,11 +12746,11 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, while (RExC_parse < RExC_end) { SV* current = NULL; RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + TRUE); /* means recognize comments */ switch (*RExC_parse) { case '?': if (RExC_parse[1] == '[') depth++, RExC_parse++; - /* FALL THROUGH */ + /* FALLTHROUGH */ default: break; case '\\': @@ -12597,7 +12863,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, /* Skip white space */ RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + TRUE /* means recognize comments */ ); if (RExC_parse >= RExC_end) { Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); } @@ -12671,7 +12937,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, RExC_flags = save_flags; goto handle_operand; } - /* FALL THROUGH */ + /* FALLTHROUGH */ default: RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; @@ -12767,7 +13033,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, top_index -= 2; SvREFCNT_dec_NN(lparen); - /* FALL THROUGH */ + /* FALLTHROUGH */ } handle_operand: @@ -12926,6 +13192,74 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } #undef IS_OPERAND +STATIC void +S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +{ + /* This hard-codes the Latin1/above-Latin1 folding rules, so that an + * innocent-looking character class, like /[ks]/i won't have to go out to + * disk to find the possible matches. + * + * This should be called only for a Latin1-range code points, cp, which is + * known to be involved in a fold with other code points above Latin1. It + * would give false results if /aa has been specified. Multi-char folds + * are outside the scope of this, and must be handled specially. + * + * XXX It would be better to generate these via regen, in case a new + * version of the Unicode standard adds new mappings, though that is not + * really likely, and may be caught by the default: case of the switch + * below. */ + + PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; + + switch (cp) { + case 'k': + case 'K': + *invlist = + add_cp_to_invlist(*invlist, KELVIN_SIGN); + break; + case 's': + case 'S': + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); + *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist = add_cp_to_invlist(*invlist, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character folds from code points + * that require UTF8 to express, so they can't match unless the + * target string is in UTF-8, so no action here is necessary, as + * regexec.c properly handles the general case for UTF-8 matching + * and multi-char folds */ + break; + default: + /* Use deprecated warning to increase the chances of this being + * output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + break; + } +} + /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ @@ -13024,6 +13358,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * that fold to/from them under /i */ SV* cp_foldable_list = NULL; + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + #ifdef EBCDIC /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ @@ -13058,9 +13396,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ANYOF_FLAGS(ret) = 0; RExC_emit += ANYOF_SKIP; - if (LOC) { - ANYOF_FLAGS(ret) |= ANYOF_LOCALE; - } listsv = newSVpvs_flags("# comment\n", SVs_TEMP); initial_listsv_len = SvCUR(listsv); SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ @@ -13068,7 +13403,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ @@ -13078,7 +13413,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_naughty++; if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } } @@ -13116,7 +13451,7 @@ parseit: if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (UCHARAT(RExC_parse) == ']') { @@ -13217,12 +13552,12 @@ parseit: e = strchr(RExC_parse++, '}'); if (!e) vFAIL2("Missing right brace on \\%c{}", c); - while (isSPACE(UCHARAT(RExC_parse))) + while (isSPACE(*RExC_parse)) RExC_parse++; if (e == RExC_parse) vFAIL2("Empty \\%c{}", c); n = e - RExC_parse; - while (isSPACE(UCHARAT(RExC_parse + n - 1))) + while (isSPACE(*(RExC_parse + n - 1))) n--; } else { @@ -13231,7 +13566,6 @@ parseit: } if (!SIZE_ONLY) { SV* invlist; - char* formatted; char* name; if (UCHARAT(RExC_parse) == '^') { @@ -13242,7 +13576,7 @@ parseit: * that bit) */ value ^= 'P' ^ 'p'; - while (isSPACE(UCHARAT(RExC_parse))) { + while (isSPACE(*RExC_parse)) { RExC_parse++; n--; } @@ -13252,14 +13586,13 @@ parseit: * will have its name be <__NAME_i>. The design is * discussed in commit * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - formatted = Perl_form(aTHX_ + name = savepv(Perl_form(aTHX_ "%s%.*s%s\n", (FOLD) ? "__" : "", (int)n, RExC_parse, (FOLD) ? "_i" : "" - ); - name = savepvn(formatted, strlen(formatted)); + )); /* Look up the property name, and get its swash and * inversion list, if the property is found */ @@ -13288,6 +13621,19 @@ parseit: "Property '%"UTF8f"' is unknown", UTF8fARG(UTF, n, name)); } + + /* If the property name doesn't already have a package + * name, add the current one to it so that it can be + * referred to outside it. [perl #121777] */ + if (! instr(name, "::") && PL_curstash) { + char* full_name = Perl_form(aTHX_ + "%s::%s", + HvNAME(PL_curstash), + name); + n = strlen(full_name); + Safefree(name); + name = savepvn(full_name, n); + } Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", (value == 'p' ? '+' : '!'), UTF8fARG(UTF, n, name)); @@ -13397,7 +13743,7 @@ parseit: goto recode_encoding; break; case 'c': - value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY); + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': @@ -13469,31 +13815,6 @@ parseit: /* Here, we have the current token in 'value' */ - /* What matches in a locale is not known until runtime. This includes - * what the Posix classes (like \w, [:space:]) match. Room must be - * reserved (one time per outer bracketed class) to store such classes, - * either if Perl is compiled so that locale nodes always should have - * this space, or if there is such posix class info to be stored. The - * space will contain a bit for each named class that is to be matched - * against. This isn't needed for \p{} and pseudo-classes, as they are - * not affected by locale, and hence are dealt with separately */ - if (LOC - && ! need_class - && (ANYOF_LOCALE == ANYOF_POSIXL - || (namedclass > OOB_NAMEDCLASS - && namedclass < ANYOF_POSIXL_MAX))) - { - need_class = 1; - if (SIZE_ONLY) { - RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; - } - else { - RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; - } - ANYOF_POSIXL_ZERO(ret); - ANYOF_FLAGS(ret) |= ANYOF_POSIXL; - } - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ U8 classnum; @@ -13532,10 +13853,30 @@ parseit: #ifndef HAS_ISASCII && classnum != _CC_ASCII #endif -#ifndef HAS_ISBLANK - && classnum != _CC_BLANK -#endif ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } + + /* Coverity thinks it is possible for this to be negative; both + * jhi and khw think it's not, but be safer */ + assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL) + || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); /* See if it already matches the complement of this POSIX * class */ @@ -13618,16 +13959,6 @@ parseit: ? &posixes : &nposixes; SV** source_ptr = &PL_XPosix_ptrs[classnum]; -#ifndef HAS_ISBLANK - /* If the platform doesn't have isblank(), we handle locale - * with the hardcoded ASII values. */ - if (LOC && classnum == _CC_BLANK) { - _invlist_subtract(*source_ptr, - PL_UpperLatin1, - source_ptr); - } -#endif - _invlist_union_maybe_complement_2nd( *posixes_ptr, *source_ptr, @@ -13647,7 +13978,7 @@ parseit: if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (range) { @@ -13736,11 +14067,9 @@ parseit: value, foldbuf, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) ); /* Here, should be the first character of the @@ -13759,7 +14088,7 @@ parseit: AV* this_array; STRLEN cp_count = utf8_length(foldbuf, foldbuf + foldlen); - SV* multi_fold = sv_2mortal(newSVpvn("", 0)); + SV* multi_fold = sv_2mortal(newSVpvs("")); Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); @@ -13826,7 +14155,7 @@ parseit: && ((prevvalue >= 'a' && value <= 'z') || (prevvalue >= 'A' && value <= 'Z'))) { - _invlist_intersection(this_range, PL_ASCII, + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII], &this_range); /* Since this above only contains ascii, the intersection of it @@ -13864,7 +14193,7 @@ parseit: #endif /* Look at the longest folds first */ - for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) { + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { if (av_exists(multi_char_matches, cp_count)) { AV** this_array_ptr; @@ -13997,14 +14326,6 @@ parseit: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } -#ifndef HAS_ISBLANK - if (op == POSIXL - && (namedclass == ANYOF_BLANK - || namedclass == ANYOF_NBLANK)) - { - op = POSIXA; - } -#endif join_posix: /* The odd numbered ones are the complements of the @@ -14042,6 +14363,26 @@ parseit: op = POSIXA; } } + else if (prevvalue == 'A') { + if (value == 'Z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; + op = POSIXA; + } + } + else if (prevvalue == 'a') { + if (value == 'z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; + op = POSIXA; + } + } } /* Here, we have changed away from its initial value iff we found @@ -14066,6 +14407,9 @@ parseit: else { RExC_emit = (regnode *)orig_emit; if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } if (invert) { op += NPOSIXD - POSIXD; } @@ -14081,7 +14425,9 @@ parseit: *flagp |= HASWIDTH|SIMPLE; } else if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } RExC_parse = (char *) cur_parse; @@ -14105,6 +14451,18 @@ parseit: UV start, end; /* End points of code point ranges */ SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &only_utf8_locale_list; + } + else { + use_list = &cp_list; + } /* Only the characters in this class that participate in folds need * be checked. Get the intersection of this class and all the @@ -14120,18 +14478,7 @@ parseit: /* This is a hash that for a particular fold gives all * characters that are involved in it */ if (! PL_utf8_foldclosures) { - - /* If the folds haven't been read in, call a fold function - * to force that */ - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES_CASE+1]; - - /* This string is just a short named one above \xff */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); - assert(PL_utf8_tofold); /* Verify that worked */ - } - PL_utf8_foldclosures - = _swash_inversion_hash(PL_utf8_tofold); + _load_PL_utf8_foldclosures(); } } @@ -14140,36 +14487,21 @@ parseit: while (invlist_iternext(fold_intersection, &start, &end)) { UV j; - /* Locale folding for Latin1 characters is deferred until - * runtime */ - if (LOC && start < 256) { - start = 256; - } - /* Look at every character in the range */ for (j = start; j <= end; j++) { - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; SV** listp; if (j < 256) { - /* We have the latin1 folding rules hard-coded here so - * that an innocent-looking character class, like - * /[ks]/i won't have to go out to disk to find the - * possible matches. XXX It would be better to - * generate these via regen, in case a new version of - * the Unicode standard adds new mappings, though that - * is not really likely, and may be caught by the - * default: case of the switch below. */ - if (IS_IN_SOME_FOLD_L1(j)) { /* ASCII is always matched; non-ASCII is matched - * only under Unicode rules */ - if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) { - cp_list = add_cp_to_invlist(cp_list, + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, PL_fold_latin1[j]); } else { @@ -14179,73 +14511,12 @@ parseit: } } - if (HAS_NONLATIN1_FOLD_CLOSURE(j) + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) { - /* Certain Latin1 characters have matches outside - * Latin1. To get here, is one of those - * characters. None of these matches is valid for - * ASCII characters under /aa, which is why the 'if' - * just above excludes those. These matches only - * happen when the target string is utf8. The code - * below adds the single fold closures for to the - * inversion list. */ - - switch (j) { - case 'k': - case 'K': - cp_list = - add_cp_to_invlist(cp_list, KELVIN_SIGN); - break; - case 's': - case 'S': - cp_list = add_cp_to_invlist(cp_list, - LATIN_SMALL_LETTER_LONG_S); - break; - case MICRO_SIGN: - cp_list = add_cp_to_invlist(cp_list, - GREEK_CAPITAL_LETTER_MU); - cp_list = add_cp_to_invlist(cp_list, - GREEK_SMALL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - cp_list = - add_cp_to_invlist(cp_list, - ANGSTROM_SIGN); - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - cp_list = add_cp_to_invlist(cp_list, - LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - cp_list = add_cp_to_invlist(cp_list, - LATIN_CAPITAL_LETTER_SHARP_S); - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character - * folds from code points that require UTF8 - * to express, so they can't match unless - * the target string is in UTF-8, so no - * action here is necessary, as regexec.c - * properly handles the general case for - * UTF-8 matching and multi-char folds */ - break; - default: - /* Use deprecated warning to increase the - * chances of this being output */ - ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); - break; - } + add_above_Latin1_folds(pRExC_state, + (U8) j, + use_list); } continue; } @@ -14255,11 +14526,9 @@ parseit: * the simple fold, as the multi-character folds have been * handled earlier and separated out */ _to_uni_fold_flags(j, foldbuf, &foldlen, - ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); /* Single character fold of above Latin1. Add everything in * its fold closure to the list that this node should match. @@ -14273,31 +14542,37 @@ parseit: { AV* list = (AV*) *listp; IV k; - for (k = 0; k <= av_len(list); k++) { + for (k = 0; k <= av_tindex(list); k++) { SV** c_p = av_fetch(list, k, FALSE); UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } + assert(c_p); + c = SvUV(*c_p); - /* /aa doesn't allow folds between ASCII and non-; - * /l doesn't allow them between above and below - * 256 */ + /* /aa doesn't allow folds between ASCII and non- */ if ((ASCII_FOLD_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && c < 256)) { + && (isASCII(c) != isASCII(j)))) + { + continue; + } + + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); continue; } - /* Folds involving non-ascii Latin1 characters - * under /d are added to a separate list */ if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) { cp_list = add_cp_to_invlist(cp_list, c); } else { - depends_list = add_cp_to_invlist(depends_list, c); + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); } } } @@ -14430,18 +14705,30 @@ parseit: /* If we didn't do folding, it's because some information isn't available * until runtime; set the run-time fold flag for these. (We don't have to * worry about properties folding, as that is taken care of by the swash - * fetching) */ - if (FOLD && LOC) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (only_utf8_locale_list) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } } /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known * at compile time. Besides not inverting folded locale now, we can't * invert if there are things such as \w, which aren't known until runtime * */ - if (invert - && ! (ANYOF_FLAGS(ret) & (ANYOF_LOC_FOLD|ANYOF_POSIXL)) + if (cp_list + && invert + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) && ! depends_list && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { @@ -14489,13 +14776,13 @@ parseit: if (cp_list && ! invert && ! depends_list - && ! (ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* We don't optimize if we are supposed to make sure all non-Unicode * code points raise a warning, as only ANYOF nodes have this check. * */ - && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) + && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) { UV start, end; U8 op = END; /* The optimzation node-type */ @@ -14588,7 +14875,9 @@ parseit: RExC_parse = (char *)cur_parse; if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } SvREFCNT_dec_NN(cp_list); @@ -14619,6 +14908,7 @@ parseit: else { cp_list = depends_list; } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; } /* If there is a swash and more than one element, we can't use the swash in @@ -14631,9 +14921,15 @@ parseit: set_ANYOF_arg(pRExC_state, ret, cp_list, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) ? listsv : NULL, + only_utf8_locale_list, swash, has_user_defined_property); *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + return ret; } @@ -14644,6 +14940,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, regnode* const node, SV* const cp_list, SV* const runtime_defns, + SV* const only_utf8_locale_list, SV* const swash, const bool has_user_defined_property) { @@ -14661,37 +14958,52 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, * av[1] if &PL_sv_undef, is a placeholder to later contain the swash * computed from av[0]. But if no further computation need be done, * the swash is stored here now (and av[0] is &PL_sv_undef). - * av[2] stores the cp_list inversion list for use in addition or instead + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. * (Otherwise everything needed is already in av[0] and av[1]) - * av[3] is set if any component of the class is from a user-defined - * property; used only if av[2] exists */ + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ UV n; PERL_ARGS_ASSERT_SET_ANYOF_ARG; - if (! cp_list && ! runtime_defns) { + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); ARG_SET(node, ANYOF_NONBITMAP_EMPTY); } else { AV * const av = newAV(); SV *rv; + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + av_store(av, 0, (runtime_defns) ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); if (swash) { + assert(cp_list); av_store(av, 1, swash); SvREFCNT_dec_NN(cp_list); } else { av_store(av, 1, &PL_sv_undef); if (cp_list) { - av_store(av, 2, cp_list); - av_store(av, 3, newSVuv(has_user_defined_property)); + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); } } + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; @@ -14702,35 +15014,34 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, /* reg_skipcomment() - Absorbs an /x style # comments from the input stream. - Returns true if there is more text remaining in the stream. - Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment - terminates the pattern without including a newline. + Absorbs an /x style # comment from the input stream, + returning a pointer to the first character beyond the comment, or if the + comment terminates the pattern without anything following it, this returns + one past the final character of the pattern (in other words, RExC_end) and + sets the REG_RUN_ON_COMMENT_SEEN flag. - Note its the callers responsibility to ensure that we are + Note it's the callers responsibility to ensure that we are actually in /x mode */ -STATIC bool -S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +PERL_STATIC_INLINE char* +S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char* p) { - bool ended = 0; - PERL_ARGS_ASSERT_REG_SKIPCOMMENT; - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') { - ended = 1; - break; + assert(*p = '#'); + + while (p < RExC_end) { + if (*(++p) == '\n') { + return p+1; } - if (!ended) { - /* we ran off the end of the pattern without ending - the comment, so we have to add an \n when wrapping */ - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; - return 0; - } else - return 1; + } + + /* we ran off the end of the pattern without ending the comment, so we have + * to add an \n when wrapping */ + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + return p; } /* nextchar() @@ -14768,16 +15079,14 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } if (RExC_flags & RXf_PMf_EXTENDED) { - if (isSPACE(*RExC_parse)) { - RExC_parse++; - continue; - } - else if (*RExC_parse == '#') { - if ( reg_skipcomment( pRExC_state ) ) - continue; - } + char * p = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + if (p != RExC_parse) { + RExC_parse = p; + continue; + } } - return retval; + return retval; } } @@ -15013,7 +15322,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tail" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), @@ -15075,9 +15384,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN if (PL_regkind[OP(scan)] == EXACT) { - bool has_exactf_sharp_s; /* Unexamined in this routine */ + bool unfolded_multi_char; /* Unexamined in this routine */ if (join_exact(pRExC_state, scan, &min, - &has_exactf_sharp_s, 1, val, depth+1)) + &unfolded_multi_char, 1, val, depth+1)) return EXACT; } #endif @@ -15103,7 +15412,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), @@ -15116,7 +15425,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, DEBUG_PARSE_r({ SV * const mysv_val=sv_newmortal(); DEBUG_PARSE_MSG(""); - regprop(RExC_rx, mysv_val, val); + regprop(RExC_rx, mysv_val, val, NULL); PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", SvPV_nolen_const(mysv_val), @@ -15146,7 +15455,9 @@ S_regdump_intflags(pTHX_ const char *lead, const U32 flags) int bit; int set=0; - for (bit=0; bit<32; bit++) { + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitcheck_substr == r->float_substr && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored")); - if (r->extflags & RXf_NOSCAN) + if (r->intflags & PREGf_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); if (r->extflags & RXf_CHECK_ALL) PerlIO_printf(Perl_debug_log, " isall"); @@ -15268,22 +15581,22 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_printf(Perl_debug_log, ") "); if (ri->regstclass) { - regprop(r, sv, ri->regstclass); + regprop(r, sv, ri->regstclass, NULL); PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); } - if (r->extflags & RXf_ANCH) { + if (r->intflags & PREGf_ANCH) { PerlIO_printf(Perl_debug_log, "anchored"); - if (r->extflags & RXf_ANCH_BOL) + if (r->intflags & PREGf_ANCH_BOL) PerlIO_printf(Perl_debug_log, "(BOL)"); - if (r->extflags & RXf_ANCH_MBOL) + if (r->intflags & PREGf_ANCH_MBOL) PerlIO_printf(Perl_debug_log, "(MBOL)"); - if (r->extflags & RXf_ANCH_SBOL) + if (r->intflags & PREGf_ANCH_SBOL) PerlIO_printf(Perl_debug_log, "(SBOL)"); - if (r->extflags & RXf_ANCH_GPOS) + if (r->intflags & PREGf_ANCH_GPOS) PerlIO_printf(Perl_debug_log, "(GPOS)"); PerlIO_putc(Perl_debug_log, ' '); } - if (r->extflags & RXf_GPOS_SEEN) + if (r->intflags & PREGf_GPOS_SEEN) PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); if (r->intflags & PREGf_SKIP) PerlIO_printf(Perl_debug_log, "plus "); @@ -15305,11 +15618,11 @@ Perl_regdump(pTHX_ const regexp *r) } /* -- regprop - printable representation of opcode +- regprop - printable representation of opcode, with run time support */ void -Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) { #ifdef DEBUGGING dVAR; @@ -15411,7 +15724,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) (UV)trie->maxlen, (UV)TRIE_CHARCOUNT(trie), (UV)trie->uniquecharcount - ) + ); ); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { sv_catpvs(sv, "["); @@ -15454,6 +15767,20 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } } } + if ( k == REF && reginfo) { + U32 n = ARG(o); /* which paren pair */ + I32 ln = prog->offs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } } else if (k == GOSUB) /* Paren and offset */ Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); @@ -15469,7 +15796,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) int do_sep = 0; - if (flags & ANYOF_LOCALE) + if (flags & ANYOF_LOCALE_FLAGS) sv_catpvs(sv, "{loc}"); if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); @@ -15492,8 +15819,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } } - if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_ABOVE_LATIN1_ALL) - || ANYOF_NONBITMAP(o)) + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) { if (do_sep) { Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); @@ -15509,13 +15838,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* output information about the unicode matching */ if (flags & ANYOF_ABOVE_LATIN1_ALL) sv_catpvs(sv, "{unicode_all}"); - else if (ANYOF_NONBITMAP(o)) { + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { SV *lv; /* Set if there is something outside the bit map. */ bool byte_output = FALSE; /* If something in the bitmap has been output */ + SV *only_utf8_locale; /* Get the stuff that wasn't in the bitmap */ - (void) regclass_swash(prog, o, FALSE, &lv, NULL); + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); if (lv && lv != &PL_sv_undef) { char *s = savesvpv(lv); char * const origs = s; @@ -15566,6 +15897,27 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Safefree(origs); SvREFCNT_dec_NN(lv); } + + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(only_utf8_locale); + } } } @@ -15573,10 +15925,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; - if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { - Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); - } - else { + if (index < C_ARRAY_LENGTH(anyofs)) { if (*anyofs[index] != '[') { sv_catpv(sv, "["); } @@ -15585,6 +15934,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpv(sv, "]"); } } + else { + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + } } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); @@ -15593,9 +15945,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) PERL_UNUSED_ARG(sv); PERL_UNUSED_ARG(o); PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); #endif /* DEBUGGING */ } + + SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ @@ -15837,7 +16192,16 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) PerlMemShared_free(aho->fail); /* do this last!!!! */ PerlMemShared_free(ri->data->data[n]); - PerlMemShared_free(ri->regstclass); + /* we should only ever get called once, so + * assert as much, and also guard the free + * which /might/ happen twice. At the least + * it will make code anlyzers happy and it + * doesn't cost much. - Yves */ + assert(ri->regstclass); + if (ri->regstclass) { + PerlMemShared_free(ri->regstclass); + ri->regstclass = 0; + } } } break; @@ -16051,12 +16415,12 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) * when the corresponding reg_ac_data struct is freed. */ reti->regstclass= ri->regstclass; - /* Fall through */ + /* FALLTHROUGH */ case 't': OP_REFCNT_LOCK; ((reg_trie_data*)ri->data->data[i])->refcount++; OP_REFCNT_UNLOCK; - /* Fall through */ + /* FALLTHROUGH */ case 'l': case 'L': d->data[i] = ri->data->data[i]; @@ -16317,6 +16681,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, last= plast; while (PL_regkind[op] != END && (!last || node < last)) { + assert(node); /* While that wasn't END last time... */ NODE_ALIGN(node); op = OP(node); @@ -16333,7 +16698,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else CLEAR_OPTSTART; - regprop(r, sv, node); + regprop(r, sv, node, NULL); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); @@ -16431,7 +16796,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) - ? ANYOF_POSIXL_SKIP : ANYOF_SKIP); + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); node = NEXTOPER(node); } else if (PL_regkind[(U8)op] == EXACT) {