X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/19bb2455f25f18445e3f4f966a5ee04a8fea6b62..6987f4434e4dfee71506125954ee1ae41c46f1cb:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 6794650..0c8beb0 100644 --- a/regcomp.c +++ b/regcomp.c @@ -143,13 +143,6 @@ EXTERN_C const struct regexp_engine wild_reg_engine; #include "invlist_inline.h" #include "unicode_constants.h" -#define HAS_NONLATIN1_FOLD_CLOSURE(i) \ - _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) -#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ - _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) -#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) -#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) - #ifndef STATIC #define STATIC static #endif @@ -163,6 +156,7 @@ typedef struct scan_frame { regnode *next_regnode; /* next node to process when last is reached */ U32 prev_recursed_depth; I32 stopparen; /* what stopparen do we use */ + bool in_gosub; /* this or an outer frame is for GOSUB */ struct scan_frame *this_prev_frame; /* this previous frame */ struct scan_frame *prev_frame; /* previous frame */ @@ -241,8 +235,7 @@ struct RExC_state_t { U8 *study_chunk_recursed; /* bitmap of which subs we have moved through */ U32 study_chunk_recursed_bytes; /* bytes in bitmap */ - I32 in_lookbehind; - I32 in_lookahead; + I32 in_lookaround; I32 contains_locale; I32 override_recoding; I32 recode_x_to_native; @@ -329,8 +322,7 @@ struct RExC_state_t { #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) #define RExC_study_chunk_recursed_bytes \ (pRExC_state->study_chunk_recursed_bytes) -#define RExC_in_lookbehind (pRExC_state->in_lookbehind) -#define RExC_in_lookahead (pRExC_state->in_lookahead) +#define RExC_in_lookaround (pRExC_state->in_lookaround) #define RExC_contains_locale (pRExC_state->contains_locale) #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) @@ -369,22 +361,15 @@ struct RExC_state_t { RExC_naughty += RExC_naughty / (exp) + (add) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') -#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ - ((*s) == '{' && regcurly(s))) +#define ISMULT2(s) (ISMULT1(*s) || ((*s) == '{' && regcurly(s))) /* * Flags to be passed up and down. */ -#define WORST 0 /* Worst case. */ #define HASWIDTH 0x01 /* Known to not match null strings, could match non-null ones. */ - -/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single - * character. (There needs to be a case: in the switch statement in regexec.c - * for any node marked SIMPLE.) Note that this is not the same thing as - * REGNODE_SIMPLE */ -#define SIMPLE 0x02 -#define SPSTART 0x04 /* Starts with * or + */ +#define SIMPLE 0x02 /* Exactly one character wide */ + /* (or LNBREAK as a special case) */ #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ #define TRYAGAIN 0x10 /* Weeded out a declaration. */ #define RESTART_PARSE 0x20 /* Need to redo the parse */ @@ -420,6 +405,11 @@ struct RExC_state_t { } \ } STMT_END +/* /u is to be chosen if we are supposed to use Unicode rules, or if the + * pattern is in UTF-8. This latter condition is in case the outermost rules + * are locale. See GH #17278 */ +#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF) + /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is * a flag that indicates we need to override /d with /u as a result of * something in the pattern. It should only be used in regards to calling @@ -1497,15 +1487,15 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, ? OPTIMIZE_INFTY : (l ? data->last_start_max + /* temporary underflow guard for 5.32 */ + : data->pos_delta < 0 ? OPTIMIZE_INFTY : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta)); } - if (data->flags & SF_BEFORE_EOL) - data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL); - else - data->substrs[i].flags &= ~SF_BEFORE_EOL; + data->substrs[i].flags &= ~SF_BEFORE_EOL; + data->substrs[i].flags |= data->flags & SF_BEFORE_EOL; data->substrs[i].minlenp = minlenp; data->substrs[i].lookbehind = 0; } @@ -1660,7 +1650,6 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, * returned list must, and will, contain every code point that is a * possibility. */ - dVAR; SV* invlist = NULL; SV* only_utf8_locale_invlist = NULL; unsigned int i; @@ -2137,8 +2126,6 @@ S_ssc_clear_locale(regnode_ssc *ssc) ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; } -#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C - STATIC bool S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) { @@ -2948,11 +2935,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, /* See if *uc is the beginning of a multi-character fold. If * so, we decrement the length remaining to look at, to account * for the current character this iteration. (We can use 'uc' - * instead of the fold returned by TRIE_READ_CHAR because for - * non-UTF, the latin1_safe macro is smart enough to account - * for all the unfolded characters, and because for UTF, the - * string will already have been folded earlier in the - * compilation process */ + * instead of the fold returned by TRIE_READ_CHAR because the + * macro is smart enough to account for any unfolded + * characters. */ if (UTF) { if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { foldlen -= UTF8SKIP(uc); @@ -4004,7 +3989,7 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour * character folded sequences. Since a single character can fold into * such a sequence, the minimum match length for this node is less than * the number of characters in the node. This routine returns in - * *min_subtract how many characters to subtract from the the actual + * *min_subtract how many characters to subtract from the actual * length of the string to get a real minimum match length; it is 0 if * there are no multi-char foldeds. This delta is used by the caller to * adjust the min length of the match, and the delta between min and max, @@ -4442,7 +4427,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, s++; } } - else { + else if (OP(scan) != EXACTFAA_NO_TRIE) { /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char * folds that are all Latin1. As explained in the comments @@ -4474,7 +4459,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, /* EXACTF nodes need to know that the minimum length * changed so that a sharp s in the string can match this * ss in the pattern, but they remain EXACTF nodes, as they - * won't match this unless the target string is is UTF-8, + * won't match this unless the target string is in UTF-8, * which we don't know until runtime. EXACTFL nodes can't * transform into EXACTFU nodes */ if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { @@ -4527,6 +4512,42 @@ S_unwind_scan_frames(pTHX_ const void *p) } while (f); } +/* Follow the next-chain of the current node and optimize away + all the NOTHINGs from it. + */ +STATIC void +S_rck_elide_nothing(pTHX_ regnode *node) +{ + PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING; + + if (OP(node) != CURLYX) { + const int max = (reg_off_by_arg[OP(node)] + ? I32_MAX + /* I32 may be smaller than U16 on CRAYs! */ + : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); + int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node)); + int noff; + regnode *n = node; + + /* Skip NOTHING and LONGJMP. */ + while ( + (n = regnext(n)) + && ( + (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) + || ((OP(n) == LONGJMP) && (noff = ARG(n))) + ) + && off + noff < max + ) { + off += noff; + } + if (reg_off_by_arg[OP(node)]) + ARG(node) = off; + else + NEXT_OFF(node) = off; + } + return; +} + /* the return from this sub is the minimum length that could possibly match */ STATIC SSize_t S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, @@ -4536,7 +4557,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 stopparen, U32 recursed_depth, regnode_ssc *and_withp, - U32 flags, U32 depth) + U32 flags, U32 depth, bool was_mutate_ok) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ /* last: Stop before this one. */ @@ -4545,7 +4566,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* recursed: which subroutines have we recursed into */ /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { - dVAR; SSize_t final_minlen; /* There must be at least this number of characters to match */ SSize_t min = 0; @@ -4608,6 +4628,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, node length to get a real minimum (because the folded version may be shorter) */ bool unfolded_multi_char = FALSE; + /* avoid mutating ops if we are anywhere within the recursed or + * enframed handling for a GOSUB: the outermost level will handle it. + */ + bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub); /* Peephole optimizer: */ DEBUG_STUDYDATA("Peep", data, depth, is_inf); DEBUG_PEEP("Peep", scan, depth, flags); @@ -4618,33 +4642,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * parsing code, as each (?:..) is handled by a different invocation of * reg() -- Yves */ - if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT - && OP(scan) != LEXACT_REQ8) + if (PL_regkind[OP(scan)] == EXACT + && OP(scan) != LEXACT + && OP(scan) != LEXACT_REQ8 + && mutate_ok + ) { join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char, 0, NULL, depth + 1); + } /* Follow the next-chain of the current node and optimize - away all the NOTHINGs from it. */ - if (OP(scan) != CURLYX) { - const int max = (reg_off_by_arg[OP(scan)] - ? I32_MAX - /* I32 may be smaller than U16 on CRAYs! */ - : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); - int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); - int noff; - regnode *n = scan; - - /* Skip NOTHING and LONGJMP. */ - while ( (n = regnext(n)) - && ( (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) - || ((OP(n) == LONGJMP) && (noff = ARG(n)))) - && off + noff < max) - off += noff; - if (reg_off_by_arg[OP(scan)]) - ARG(scan) = off; - else - NEXT_OFF(scan) = off; - } + away all the NOTHINGs from it. + */ + rck_elide_nothing(scan); /* The principal pseudo-switch. Cannot be a switch, since we look into * several different things. */ @@ -4671,7 +4681,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* DEFINEP study_chunk() recursion */ (void)study_chunk(pRExC_state, &scan, &minlen, &deltanext, next, &data_fake, stopparen, - recursed_depth, NULL, f, depth+1); + recursed_depth, NULL, f, depth+1, mutate_ok); scan = next; } else @@ -4739,7 +4749,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* recurse study_chunk() for each BRANCH in an alternation */ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, next, &data_fake, stopparen, - recursed_depth, NULL, f, depth+1); + recursed_depth, NULL, f, depth+1, + mutate_ok); if (min1 > minnext) min1 = minnext; @@ -4806,9 +4817,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } - if (PERL_ENABLE_TRIE_OPTIMISATION && - OP( startbranch ) == BRANCH ) - { + if (PERL_ENABLE_TRIE_OPTIMISATION + && OP(startbranch) == BRANCH + && mutate_ok + ) { /* demq. Assuming this was/is a branch we are dealing with: 'scan' @@ -5188,7 +5200,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * might result in a minlen of 1 and not of 4, * but this doesn't make us mismatch, just try a bit * harder than we should. - * */ + * + * However we must assume this GOSUB is infinite, to + * avoid wrongly applying other optimizations in the + * enclosing scope - see GH 18096, for example. + */ + is_inf = is_inf_internal = 1; scan= regnext(scan); continue; } @@ -5261,6 +5278,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, newframe->stopparen = stopparen; newframe->prev_recursed_depth = recursed_depth; newframe->this_prev_frame= frame; + newframe->in_gosub = ( + (frame && frame->in_gosub) || OP(scan) == GOSUB + ); DEBUG_STUDYDATA("frame-new", data, depth, is_inf); DEBUG_PEEP("fnew", scan, depth, flags); @@ -5275,12 +5295,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, continue; } } - else if ( OP(scan) == EXACT - || OP(scan) == LEXACT - || OP(scan) == EXACT_REQ8 - || OP(scan) == LEXACT_REQ8 - || OP(scan) == EXACTL) - { + else if (PL_regkind[OP(scan)] == EXACT && ! isEXACTFish(OP(scan))) { SSize_t bytelen = STR_LEN(scan), charlen; UV uc; assert(bytelen); @@ -5298,8 +5313,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, offset, later match for variable offset. */ if (data->last_end == -1) { /* Update the start info. */ data->last_start_min = data->pos_min; - data->last_start_max = is_inf - ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta; + data->last_start_max = + is_inf ? OPTIMIZE_INFTY + : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min) + ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), bytelen); if (UTF) @@ -5345,8 +5362,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && isALPHA_A(*s) && ( OP(scan) == EXACTFAA || ( OP(scan) == EXACTFU - && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))) - { + && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))) + && mutate_ok + ) { U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */ OP(scan) = ANYOFM; @@ -5416,11 +5434,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, case PLUS: if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { next = NEXTOPER(scan); - if ( OP(next) == EXACT - || OP(next) == LEXACT - || OP(next) == EXACT_REQ8 - || OP(next) == LEXACT_REQ8 - || OP(next) == EXACTL + if ( ( PL_regkind[OP(next)] == EXACT + && ! isEXACTFish(OP(next))) || (flags & SCF_DO_STCLASS)) { mincount = 1; @@ -5432,6 +5447,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } if (flags & SCF_DO_SUBSTR) data->pos_min++; + /* This will bypass the formal 'min += minnext * mincount' + * calculation in the do_curly path, so assumes min width + * of the PLUS payload is exactly one. */ min++; /* FALLTHROUGH */ case STAR: @@ -5439,7 +5457,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* This temporary node can now be turned into EXACTFU, and * must, as regexec.c doesn't handle it */ - if (OP(next) == EXACTFU_S_EDGE) { + if (OP(next) == EXACTFU_S_EDGE && mutate_ok) { OP(next) = EXACTFU; } @@ -5447,8 +5465,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && isALPHA_A(* STRING(next)) && ( OP(next) == EXACTFAA || ( OP(next) == EXACTFU - && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))) - { + && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))) + && mutate_ok + ) { /* These differ in just one bit */ U8 mask = ~ ('A' ^ 'a'); @@ -5535,7 +5554,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, (mincount == 0 ? (f & ~SCF_DO_SUBSTR) : f) - ,depth+1); + , depth+1, mutate_ok); if (flags & SCF_DO_STCLASS) data->start_class = oclass; @@ -5581,6 +5600,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, RExC_precomp))); } + if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext ) + || min >= SSize_t_MAX - minnext * mincount ) + { + FAIL("Regexp out of space"); + } + min += minnext * mincount; is_inf_internal |= deltanext == OPTIMIZE_INFTY || (maxcount == REG_INFTY && minnext + deltanext > 0); @@ -5595,7 +5620,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR && !(data->flags & SF_HAS_EVAL) - && !deltanext && minnext == 1 ) { + && !deltanext && minnext == 1 + && mutate_ok + ) { /* Try to optimize to CURLYN. */ regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; regnode * const nxt1 = nxt; @@ -5645,10 +5672,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && !(data->flags & SF_HAS_EVAL) && !deltanext /* atom is fixed width */ && minnext != 0 /* CURLYM can't handle zero width */ - /* Nor characters whose fold at run-time may be * multi-character */ && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) + && mutate_ok ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -5701,7 +5728,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* recurse study_chunk() on optimised CURLYX => CURLYM */ study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, NULL, stopparen, recursed_depth, NULL, 0, - depth+1); + depth+1, mutate_ok); } else oscan->flags = 0; @@ -5831,11 +5858,7 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", if (data && (fl & SF_HAS_EVAL)) data->flags |= SF_HAS_EVAL; optimize_curly_tail: - if (OP(oscan) != CURLYX) { - while (PL_regkind[OP(next = regnext(oscan))] == NOTHING - && NEXT_OFF(next)) - NEXT_OFF(oscan) += NEXT_OFF(next); - } + rck_elide_nothing(oscan); continue; default: @@ -5966,7 +5989,10 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", (regnode_charclass *) scan); break; - case NANYOFM: + case NANYOFM: /* NANYOFM already contains the inversion of the + input ANYOF data, so, unlike things like + NPOSIXA, don't change 'invert' to TRUE */ + /* FALLTHROUGH */ case ANYOFM: { SV* cp_list = get_ANYOFM_contents(scan); @@ -6131,7 +6157,8 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", /* recurse study_chunk() for lookahead body */ minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, stopparen, - recursed_depth, NULL, f, depth+1); + recursed_depth, NULL, f, depth+1, + mutate_ok); if (scan->flags) { if ( deltanext < 0 || deltanext > (I32) U8_MAX @@ -6236,7 +6263,7 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, stopparen, recursed_depth, NULL, - f, depth+1); + f, depth+1, mutate_ok); if (scan->flags) { assert(0); /* This code has never been tested since this is normally not compiled */ @@ -6403,7 +6430,8 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", /* optimise study_chunk() for TRIE */ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, (regnode *)nextbranch, &data_fake, - stopparen, recursed_depth, NULL, f, depth+1); + stopparen, recursed_depth, NULL, f, depth+1, + mutate_ok); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) nextbranch= regnext((regnode*)nextbranch); @@ -6876,7 +6904,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, /* we make the assumption here that each op in the list of * op_siblings maps to one SV pushed onto the stack, * except for code blocks, with have both an OP_NULL and - * and OP_CONST. + * an OP_CONST. * This allows us to match up the list of SVs against the * list of OPs to find the next code block. * @@ -6926,7 +6954,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, pRExC_state->code_blocks->count -= n; n = 0; } - else { + else { /* ... or failing that, try "" overload */ while (SvAMAGIC(msv) && (sv = AMG_CALLunary(msv, string_amg)) @@ -7369,7 +7397,7 @@ S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx) PERL_ARGS_ASSERT_SET_REGEX_PV; /* make sure PL_bitcount bounds not exceeded */ - assert(sizeof(STD_PAT_MODS) <= 8); + STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8); p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */ SvPOK_on(Rx); @@ -7463,7 +7491,7 @@ S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx) * length of the pattern. Patches welcome to improve that guess. That amount * of space is malloc'd and then immediately freed, and then clawed back node * by node. This design is to minimze, to the extent possible, memory churn - * when doing the the reallocs. + * when doing the reallocs. * * A separate parentheses counting pass may be needed in some cases. * (Previously the sizing pass did this.) Patches welcome to reduce the number @@ -7481,7 +7509,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *expr, const regexp_engine* eng, REGEXP *old_re, bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags) { - dVAR; REGEXP *Rx; /* Capital 'R' means points to a REGEXP */ STRLEN plen; char *exp; @@ -7703,7 +7730,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, rx_flags = orig_rx_flags; - if ( (UTF || RExC_uni_semantics) + if ( toUSE_UNI_CHARSET_NOT_DEPENDS && initial_charset == REGEX_DEPENDS_CHARSET) { @@ -7734,8 +7761,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_seen = 0; RExC_maxlen = 0; - RExC_in_lookbehind = 0; - RExC_in_lookahead = 0; + RExC_in_lookaround = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_recode_x_to_native = 0; RExC_in_multi_char_class = 0; @@ -8080,12 +8106,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, DEBUG_PEEP("first:", first, 0, 0); /* Ignore EXACT as we deal with it later. */ if (PL_regkind[OP(first)] == EXACT) { - if ( OP(first) == EXACT - || OP(first) == LEXACT - || OP(first) == EXACT_REQ8 - || OP(first) == LEXACT_REQ8 - || OP(first) == EXACTL) - { + if (! isEXACTFish(OP(first))) { NOOP; /* Empty, get anchored substr later. */ } else @@ -8191,7 +8212,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, &data, -1, 0, NULL, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), - 0); + 0, TRUE); CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); @@ -8320,7 +8341,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied ? SCF_TRIE_DOING_RESTUDY : 0), - 0); + 0, TRUE); CHECK_RESTUDY_GOTO_butfirst(NOOP); @@ -8429,9 +8450,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && nop == END) RExC_rx->extflags |= RXf_WHITE; else if ( RExC_rx->extflags & RXf_SPLIT - && ( fop == EXACT || fop == LEXACT - || fop == EXACT_REQ8 || fop == LEXACT_REQ8 - || fop == EXACTL) + && (PL_regkind[fop] == EXACT && ! isEXACTFish(fop)) && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && nop == END ) @@ -8675,9 +8694,9 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) } else if (flags & RXapif_ONE) { ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); av = MUTABLE_AV(SvRV(ret)); - length = av_tindex(av); + length = av_count(av); SvREFCNT_dec_NN(ret); - return newSViv(length + 1); + return newSViv(length); } else { Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); @@ -9700,7 +9719,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * one of them */ while (i_a < len_a && i_b < len_b) { UV cp; /* The element to potentially add to the union's array */ - bool cp_in_set; /* is it in the the input list's set or not */ + bool cp_in_set; /* is it in the input list's set or not */ /* We need to take one or the other of the two inputs for the union. * Since we are merging two sorted lists, we take the smaller of the @@ -10597,7 +10616,6 @@ Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) STATIC SV* S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) { - dVAR; const U8 * s = (U8*)STRING(node); SSize_t bytelen = STR_LEN(node); UV uc; @@ -10643,8 +10661,8 @@ S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) /* Some characters match above-Latin1 ones under /i. This * is true of EXACTFL ones when the locale is UTF-8 */ if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) - && (! isASCII(uc) || (OP(node) != EXACTFAA - && OP(node) != EXACTFAA_NO_TRIE))) + && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA, + EXACTFAA_NO_TRIE))) { add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist); } @@ -10658,7 +10676,7 @@ S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); /* The only code points that aren't folded in a UTF EXACTFish - * node are are the problematic ones in EXACTFL nodes */ + * node are the problematic ones in EXACTFL nodes */ if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) { /* We need to check for the possibility that this EXACTFL * node begins with a multi-char fold. Therefore we fold @@ -10698,12 +10716,8 @@ S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) * the folded string to be just past any possible multi-char * fold. * - * Unlike the non-UTF-8 case, the macro for determining if a - * string is a multi-char fold requires all the characters to - * already be folded. This is because of all the complications - * if not. Note that they are folded anyway, except in EXACTFL - * nodes. Like the non-UTF case above, we punt if the node - * begins with a multi-char fold */ + * Like the non-UTF case above, we punt if the node begins with a + * multi-char fold */ if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { invlist = _add_range_to_invlist(invlist, 0, UV_MAX); @@ -10725,7 +10739,7 @@ S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) UV c = (k == 0) ? first_fold : remaining_folds[k-1]; /* /aa doesn't allow folds between ASCII and non- */ - if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE) + if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE) && isASCII(c) != isASCII(fc)) { continue; @@ -10798,7 +10812,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) RExC_parse++; has_use_defaults = TRUE; STD_PMMOD_FLAGS_CLEAR(&RExC_flags); - cs = (RExC_uni_semantics) + cs = (toUSE_UNI_CHARSET_NOT_DEPENDS) ? REGEX_UNICODE_CHARSET : REGEX_DEPENDS_CHARSET; set_regex_charset(&RExC_flags, cs); @@ -10806,7 +10820,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) else { cs = get_regex_charset(RExC_flags); if ( cs == REGEX_DEPENDS_CHARSET - && RExC_uni_semantics) + && (toUSE_UNI_CHARSET_NOT_DEPENDS)) { cs = REGEX_UNICODE_CHARSET; } @@ -10890,7 +10904,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) * pattern (or target, not known until runtime) are * utf8, or something in the pattern indicates unicode * semantics */ - cs = (RExC_uni_semantics) + cs = (toUSE_UNI_CHARSET_NOT_DEPENDS) ? REGEX_UNICODE_CHARSET : REGEX_DEPENDS_CHARSET; has_charset_modifier = DEPENDS_PAT_MOD; @@ -11116,6 +11130,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) I32 after_freeze = 0; I32 num; /* numeric backreferences */ SV * max_open; /* Max number of unclosed parens */ + I32 was_in_lookaround = RExC_in_lookaround; char * parse_start = RExC_parse; /* MJD */ char * const oregcomp_parse = RExC_parse; @@ -11135,14 +11150,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) vFAIL("Too many nested open parens"); } - *flagp = 0; /* Tentatively. */ - - if (RExC_in_lookbehind) { - RExC_in_lookbehind++; - } - if (RExC_in_lookahead) { - RExC_in_lookahead++; - } + *flagp = 0; /* Initialize. */ /* Having this true makes it feasible to have a lot fewer tests for the * parse pointer being in scope. For example, we can write @@ -11397,11 +11405,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) lookbehind_alpha_assertions: RExC_seen |= REG_LOOKBEHIND_SEEN; - RExC_in_lookbehind++; /*FALLTHROUGH*/ alpha_assertions: + RExC_in_lookaround++; RExC_seen_zerolen++; if (! start_arg) { @@ -11472,6 +11480,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) bool is_logical = 0; const char * const seqstart = RExC_parse; const char * endptr; + const char non_existent_group_msg[] + = "Reference to nonexistent group"; + const char impossible_group[] = "Invalid reference to group"; + if (has_intervening_patws) { RExC_parse++; vFAIL("In '(?...)', the '(' and '?' must be adjacent"); @@ -11600,7 +11612,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } RExC_seen |= REG_LOOKBEHIND_SEEN; - RExC_in_lookbehind++; + RExC_in_lookaround++; RExC_parse++; if (RExC_parse >= RExC_end) { vFAIL("Sequence (?... not terminated"); @@ -11609,7 +11621,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) break; case '=': /* (?=...) */ RExC_seen_zerolen++; - RExC_in_lookahead++; + RExC_in_lookaround++; break; case '!': /* (?!...) */ RExC_seen_zerolen++; @@ -11621,6 +11633,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) nextchar(pRExC_state); return ret; } + RExC_in_lookaround++; break; case '|': /* (?|...) */ /* branch reset, behave like a (?:...) except that @@ -11698,10 +11711,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) ) { num = (I32)unum; RExC_parse = (char*)endptr; - } else - num = I32_MAX; + } + else { /* Overflow, or something like that. Position + beyond all digits for the message */ + while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) { + RExC_parse++; + } + vFAIL(impossible_group); + } if (is_neg) { - /* Some limit for num? */ + /* -num is always representable on 1 and 2's complement + * machines */ num = -num; } } @@ -11709,45 +11729,43 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) vFAIL("Expecting close bracket"); gen_recurse_regop: - if ( paren == '-' ) { + if (paren == '-' || paren == '+') { + + /* Don't overflow */ + if (UNLIKELY(I32_MAX - RExC_npar < num)) { + RExC_parse++; + vFAIL(impossible_group); + } + /* Diagram of capture buffer numbering. Top line is the normal capture buffer numbers Bottom line is the negative indexing as from the X (the (?-2)) - + 1 2 3 4 5 X 6 7 + 1 2 3 4 5 X Y 6 7 + /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/ /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ - - 5 4 3 2 1 X x x + - 5 4 3 2 1 X Y x x + Resolve to absolute group. Recall that RExC_npar is +1 of + the actual parenthesis group number. For lookahead, we + have to compensate for that. Using the above example, when + we get to Y in the parse, num is 2 and RExC_npar is 6. We + want 7 for +2, and 4 for -2. */ - num = RExC_npar + num; - if (num < 1) { + if ( paren == '+' ) { + num--; + } - /* It might be a forward reference; we can't fail until - * we know, by completing the parse to get all the - * groups, and then reparsing */ - if (ALL_PARENS_COUNTED) { - RExC_parse++; - vFAIL("Reference to nonexistent group"); - } - else { - REQUIRE_PARENS_PASS; - } + num += RExC_npar; + + if (paren == '-' && num < 1) { + RExC_parse++; + vFAIL(non_existent_group_msg); } - } else if ( paren == '+' ) { - num = RExC_npar + num - 1; } - /* We keep track how many GOSUB items we have produced. - To start off the ARG2L() of the GOSUB holds its "id", - which is used later in conjunction with RExC_recurse - to calculate the offset we need to jump for the GOSUB, - which it will store in the final representation. - We have to defer the actual calculation until much later - as the regop may move. - */ - ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count); if (num >= RExC_npar) { /* It might be a forward reference; we can't fail until we @@ -11756,13 +11774,23 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (ALL_PARENS_COUNTED) { if (num >= RExC_total_parens) { RExC_parse++; - vFAIL("Reference to nonexistent group"); + vFAIL(non_existent_group_msg); } } else { REQUIRE_PARENS_PASS; } } + + /* We keep track how many GOSUB items we have produced. + To start off the ARG2L() of the GOSUB holds its "id", + which is used later in conjunction with RExC_recurse + to calculate the offset we need to jump for the GOSUB, + which it will store in the final representation. + We have to defer the actual calculation until much later + as the regop may move. + */ + ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count); RExC_recurse_count++; DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ "%*s%*s Recurse #%" UVuf " to %" IVdf "\n", @@ -12210,7 +12238,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } else if (paren != '?') /* Not Conditional */ ret = br; - *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + *flagp |= flags & (HASWIDTH | POSTPONED); lastbr = br; while (*RExC_parse == '|') { if (RExC_use_BRANCHJ) { @@ -12240,7 +12268,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) REQUIRE_BRANCHJ(flagp, 0); } lastbr = br; - *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + *flagp |= flags & (HASWIDTH | POSTPONED); } if (have_branch || paren != ':') { @@ -12407,7 +12435,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /* restore original flags, but keep (?p) and, if we've encountered * something in the parse that changes /d rules into /u, keep the /u */ RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); - if (DEPENDS_SEMANTICS && RExC_uni_semantics) { + if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) { set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); } if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') { @@ -12426,14 +12454,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) NOT_REACHED; /* NOTREACHED */ } - if (RExC_in_lookbehind) { - RExC_in_lookbehind--; - } - if (RExC_in_lookahead) { - RExC_in_lookahead--; - } if (after_freeze > RExC_npar) RExC_npar = after_freeze; + + RExC_in_lookaround = was_in_lookaround; + return(ret); } @@ -12473,7 +12498,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) } } - *flagp = WORST; /* Tentatively. */ + *flagp = 0; /* Initialize. */ skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE /* Don't force to /x */ ); @@ -12489,9 +12514,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) else if (ret == 0) ret = latest; *flagp |= flags&(HASWIDTH|POSTPONED); - if (chain == 0) /* First piece. */ - *flagp |= flags&SPSTART; - else { + if (chain != 0) { /* FIXME adding one for every branch after the first is probably * excessive now we have TRIE support. (hv) */ MARK_NAUGHTY(1); @@ -12518,6 +12541,30 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) } /* + - regcurly - a little FSA that accepts {\d+,?\d*} + Pulled from reg.c. + */ +bool +Perl_regcurly(const char *s) +{ + PERL_ARGS_ASSERT_REGCURLY; + + if (*s++ != '{') + return FALSE; + if (!isDIGIT(*s)) + return FALSE; + while (isDIGIT(*s)) + s++; + if (*s == ',') { + s++; + while (isDIGIT(*s)) + s++; + } + + return *s == '}'; +} + +/* - regpiece - something followed by possible quantifier * + ? {n,m} * * Note that the branching code sequences used for ? and the general cases @@ -12565,28 +12612,51 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags); } - op = *RExC_parse; - - if (op == '{' && regcurly(RExC_parse)) { - maxpos = NULL; #ifdef RE_TRACK_PATTERN_OFFSETS - parse_start = RExC_parse; /* MJD */ + parse_start = RExC_parse; #endif - next = RExC_parse + 1; - while (isDIGIT(*next) || *next == ',') { - if (*next == ',') { - if (maxpos) - break; - else - maxpos = next; - } - next++; - } - if (*next == '}') { /* got one */ + + op = *RExC_parse; + switch (op) { + + case '*': + nextchar(pRExC_state); + min = 0; + break; + + case '+': + nextchar(pRExC_state); + min = 1; + break; + + case '?': + nextchar(pRExC_state); + min = 0; max = 1; + break; + + case '{': /* A '{' may or may not indicate a quantifier; call regcurly() + to determine which */ + if (regcurly(RExC_parse)) { const char* endptr; - if (!maxpos) - maxpos = next; - RExC_parse++; + + /* Here is a quantifier, parse for min and max values */ + maxpos = NULL; + next = RExC_parse + 1; + while (isDIGIT(*next) || *next == ',') { + if (*next == ',') { + if (maxpos) + break; + else + maxpos = next; + } + next++; + } + + assert(*next == '}'); + + if (!maxpos) + maxpos = next; + RExC_parse++; if (isDIGIT(*RExC_parse)) { endptr = RExC_end; if (!grok_atoUV(RExC_parse, &uv, &endptr)) @@ -12597,10 +12667,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } else { min = 0; } - if (*maxpos == ',') - maxpos++; - else - maxpos = RExC_parse; + if (*maxpos == ',') + maxpos++; + else + maxpos = RExC_parse; if (isDIGIT(*maxpos)) { endptr = RExC_end; if (!grok_atoUV(maxpos, &uv, &endptr)) @@ -12609,10 +12679,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); max = (I32)uv; } else { - max = REG_INFTY; /* meaning "infinity" */ + max = REG_INFTY; /* meaning "infinity" */ } - RExC_parse = next; - nextchar(pRExC_state); + + RExC_parse = next; + nextchar(pRExC_state); if (max < min) { /* If can't match, warn and optimize to fail unconditionally */ reginsert(pRExC_state, OPFAIL, orig_emit, depth+1); @@ -12628,149 +12699,129 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *RExC_parse); } - do_curly: - if ((flags&SIMPLE)) { - if (min == 0 && max == REG_INFTY) { + break; + } /* End of is regcurly() */ - /* Going from 0..inf is currently forbidden in wildcard - * subpatterns. The only reason is to make it harder to - * write patterns that take a long long time to halt, and - * because the use of this construct isn't necessary in - * matching Unicode property values */ - if (RExC_pm_flags & PMf_WILDCARD) { - RExC_parse++; - /* diag_listed_as: Use of %s is not allowed in Unicode - property wildcard subpatterns in regex; marked by - <-- HERE in m/%s/ */ - vFAIL("Use of quantifier '*' is not allowed in" - " Unicode property wildcard subpatterns"); - /* Note, don't need to worry about {0,}, as a '}' isn't - * legal at all in wildcards, so wouldn't get this far - * */ - } - reginsert(pRExC_state, STAR, ret, depth+1); - MARK_NAUGHTY(4); - RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; - goto nest_check; - } - if (min == 1 && max == REG_INFTY) { - reginsert(pRExC_state, PLUS, ret, depth+1); - MARK_NAUGHTY(3); - RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; - goto nest_check; - } - MARK_NAUGHTY_EXP(2, 2); - reginsert(pRExC_state, CURLY, ret, depth+1); - Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */ - Set_Node_Cur_Length(REGNODE_p(ret), parse_start); - } - else { - const regnode_offset w = reg_node(pRExC_state, WHILEM); + /* Here was a '{', but what followed it didn't form a quantifier. */ + /* FALLTHROUGH */ - FLAGS(REGNODE_p(w)) = 0; - if (! REGTAIL(pRExC_state, ret, w)) { - REQUIRE_BRANCHJ(flagp, 0); - } - if (RExC_use_BRANCHJ) { - reginsert(pRExC_state, LONGJMP, ret, depth+1); - reginsert(pRExC_state, NOTHING, ret, depth+1); - NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */ - } - reginsert(pRExC_state, CURLYX, ret, depth+1); - /* MJD hk */ - Set_Node_Offset(REGNODE_p(ret), parse_start+1); - Set_Node_Length(REGNODE_p(ret), - op == '{' ? (RExC_parse - parse_start) : 1); - - if (RExC_use_BRANCHJ) - NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to - LONGJMP. */ - if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state, - NOTHING))) - { - REQUIRE_BRANCHJ(flagp, 0); - } - RExC_whilem_seen++; - MARK_NAUGHTY_EXP(1, 4); /* compound interest */ - } - FLAGS(REGNODE_p(ret)) = 0; - - if (min > 0) - *flagp = WORST; - if (max > 0) - *flagp |= HASWIDTH; - ARG1_SET(REGNODE_p(ret), (U16)min); - ARG2_SET(REGNODE_p(ret), (U16)max); - if (max == REG_INFTY) - RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; - - goto nest_check; - } + default: + *flagp = flags; + return(ret); + NOT_REACHED; /*NOTREACHED*/ } - if (!ISMULT1(op)) { - *flagp = flags; - return(ret); - } + /* Here we have a quantifier, and have calculated 'min' and 'max'. + * + * Check and possibly adjust a zero width operand */ + if (! (flags & (HASWIDTH|POSTPONED))) { + if (max > REG_INFTY/3) { + if (origparse[0] == '\\' && origparse[1] == 'K') { + vFAIL2utf8f( + "%" UTF8f " is forbidden - matches null string" + " many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); + } else { + ckWARN2reg(RExC_parse, + "%" UTF8f " matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); + } + } -#if 0 /* Now runtime fix should be reliable. */ + /* There's no point in trying to match something 0 length more than + * once except for extra side effects, which we don't have here since + * not POSTPONED */ + if (max > 1) { + max = 1; + if (min > max) { + min = max; + } + } + } - /* if this is reinstated, don't forget to put this back into perldiag: + /* If this is a code block pass it up */ + *flagp |= (flags & POSTPONED); - =item Regexp *+ operand could be empty at {#} in regex m/%s/ + if (max > 0) { + *flagp |= (flags & HASWIDTH); + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + } - (F) The part of the regexp subject to either the * or + quantifier - could match an empty string. The {#} shows in the regular - expression about where the problem was discovered. + /* 'SIMPLE' operands don't require full generality */ + if ((flags&SIMPLE)) { + if (max == REG_INFTY) { + if (min == 0) { + if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) { + goto min0_maxINF_wildcard_forbidden; + } - */ + reginsert(pRExC_state, STAR, ret, depth+1); + MARK_NAUGHTY(4); + goto done_main_op; + } + else if (min == 1) { + reginsert(pRExC_state, PLUS, ret, depth+1); + MARK_NAUGHTY(3); + goto done_main_op; + } + } - if (!(flags&HASWIDTH) && op != '?') - vFAIL("Regexp *+ operand could be empty"); -#endif + /* Here, SIMPLE, but not the '*' and '+' special cases */ -#ifdef RE_TRACK_PATTERN_OFFSETS - parse_start = RExC_parse; -#endif - nextchar(pRExC_state); + MARK_NAUGHTY_EXP(2, 2); + reginsert(pRExC_state, CURLY, ret, depth+1); + Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */ + Set_Node_Cur_Length(REGNODE_p(ret), parse_start); + } + else { /* not SIMPLE */ + const regnode_offset w = reg_node(pRExC_state, WHILEM); - *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); - - if (op == '*') { - min = 0; - goto do_curly; - } - else if (op == '+') { - min = 1; - goto do_curly; - } - else if (op == '?') { - min = 0; max = 1; - goto do_curly; - } - nest_check: - if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { - if (origparse[0] == '\\' && origparse[1] == 'K') { - vFAIL2utf8f( - "%" UTF8f " is forbidden - matches null string many times", - UTF8fARG(UTF, (RExC_parse >= origparse - ? RExC_parse - origparse - : 0), - origparse)); - /* NOT-REACHED */ - } else { - ckWARN2reg(RExC_parse, - "%" UTF8f " matches null string many times", - UTF8fARG(UTF, (RExC_parse >= origparse - ? RExC_parse - origparse - : 0), - origparse)); + FLAGS(REGNODE_p(w)) = 0; + if (! REGTAIL(pRExC_state, ret, w)) { + REQUIRE_BRANCHJ(flagp, 0); } + if (RExC_use_BRANCHJ) { + reginsert(pRExC_state, LONGJMP, ret, depth+1); + reginsert(pRExC_state, NOTHING, ret, depth+1); + NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */ + } + reginsert(pRExC_state, CURLYX, ret, depth+1); + /* MJD hk */ + Set_Node_Offset(REGNODE_p(ret), parse_start+1); + Set_Node_Length(REGNODE_p(ret), + op == '{' ? (RExC_parse - parse_start) : 1); + + if (RExC_use_BRANCHJ) + NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to + LONGJMP. */ + if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state, + NOTHING))) + { + REQUIRE_BRANCHJ(flagp, 0); + } + RExC_whilem_seen++; + MARK_NAUGHTY_EXP(1, 4); /* compound interest */ } + /* Finish up the CURLY/CURLYX case */ + FLAGS(REGNODE_p(ret)) = 0; + + ARG1_SET(REGNODE_p(ret), (U16)min); + ARG2_SET(REGNODE_p(ret), (U16)max); + + done_main_op: + + /* Process any greediness modifiers */ if (*RExC_parse == '?') { - nextchar(pRExC_state); - reginsert(pRExC_state, MINMOD, ret, depth+1); + nextchar(pRExC_state); + reginsert(pRExC_state, MINMOD, ret, depth+1); if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) { REQUIRE_BRANCHJ(flagp, 0); } @@ -12789,12 +12840,32 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } + /* Forbid extra quantifiers */ if (ISMULT2(RExC_parse)) { - RExC_parse++; - vFAIL("Nested quantifiers"); + RExC_parse++; + vFAIL("Nested quantifiers"); } return(ret); + + min0_maxINF_wildcard_forbidden: + + /* Here we are in a wildcard match, and the minimum match length is 0, and + * the max could be infinity. This is currently forbidden. The only + * reason is to make it harder to write patterns that take a long long time + * to halt, and because the use of this construct isn't necessary in + * matching Unicode property values */ + RExC_parse++; + /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard + subpatterns in regex; marked by <-- HERE in m/%s/ + */ + vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard" + " subpatterns"); + + /* Note, don't need to worry about the input being '{0,}', as a '}' isn't + * legal at all in wildcards, so can't get this far */ + + NOT_REACHED; /*NOTREACHED*/ } STATIC bool @@ -13232,7 +13303,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf, (UV) flags); } - *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED); nextchar(pRExC_state); @@ -13395,7 +13466,6 @@ S_backref_value(char *p, char *e) STATIC regnode_offset S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { - dVAR; regnode_offset ret = 0; I32 flags = 0; char *parse_start; @@ -13404,7 +13474,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) DECLARE_AND_GET_RE_DEBUG_FLAGS; - *flagp = WORST; /* Tentatively. */ + *flagp = 0; /* Initialize. */ DEBUG_PARSE("atom"); @@ -13482,7 +13552,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf, (UV) flags); } - *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED); break; case '|': case ')': @@ -13528,7 +13598,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * /\A/ from /^/ in split. */ FLAGS(REGNODE_p(ret)) = 1; } - *flagp |= SIMPLE; goto finish_meta_pat; case 'G': if (RExC_pm_flags & PMf_WILDCARD) { @@ -13541,13 +13610,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } ret = reg_node(pRExC_state, GPOS); RExC_seen |= REG_GPOS_SEEN; - *flagp |= SIMPLE; goto finish_meta_pat; case 'K': - if (!RExC_in_lookbehind && !RExC_in_lookahead) { + if (!RExC_in_lookaround) { RExC_seen_zerolen++; ret = reg_node(pRExC_state, KEEPS); - *flagp |= SIMPLE; /* XXX:dmq : disabling in-place substitution seems to * be necessary here to avoid cases of memory corruption, as * with: C<$_="x" x 80; s/x\K/y/> -- rgs @@ -13567,7 +13634,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else { ret = reg_node(pRExC_state, SEOL); } - *flagp |= SIMPLE; RExC_seen_zerolen++; /* Do not optimize RE away */ goto finish_meta_pat; case 'z': @@ -13578,7 +13644,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else { ret = reg_node(pRExC_state, EOS); } - *flagp |= SIMPLE; RExC_seen_zerolen++; /* Do not optimize RE away */ goto finish_meta_pat; case 'C': @@ -13701,8 +13766,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, op); FLAGS(REGNODE_p(ret)) = flags; - *flagp |= SIMPLE; - goto finish_meta_pat; } @@ -13877,10 +13940,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) num > 9 /* any numeric escape < RExC_npar is a backref */ && num >= RExC_npar - /* cannot be an octal escape if it starts with 8 */ - && *RExC_parse != '8' - /* cannot be an octal escape if it starts with 9 */ - && *RExC_parse != '9' + /* cannot be an octal escape if it starts with [89] */ + && ! inRANGE(*RExC_parse, '8', '9') ) { /* Probably not meant to be a backref, instead likely * to be an octal character escape, e.g. \35 or \777. @@ -14495,6 +14556,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * things */ maybe_exactfu = FALSE; + /* Although these two characters have folds that are + * locale-problematic, they also have folds to above Latin1 + * that aren't a problem. Doing these now helps at + * runtime. */ + if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU + || ender == LATIN_CAPITAL_LETTER_SHARP_S)) + { + goto fold_anyway; + } + /* Here, we are adding a problematic fold character. * "Problematic" in this context means that its fold isn't * known until runtime. (The non-problematic code points @@ -14548,15 +14619,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *(s)++ = (U8) toFOLD(ender); } else { - UV folded = _to_uni_fold_flags( + UV folded; + + fold_anyway: + folded = _to_uni_fold_flags( ender, (U8 *) s, /* We have allocated extra space in 's' so can't run off the end */ &added_len, - FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); + FOLD_FLAGS_FULL + | (( ASCII_FOLD_RESTRICTED + || node_type == EXACTFL) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); if (UNLIKELY(len + added_len > max_string_len)) { overflowed = TRUE; break; @@ -14932,12 +15008,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * * The solution used here for peeking ahead is to look at that * next character. If it isn't ASCII punctuation, then it will - * be something that continues in an EXACTish node if there - * were space. We append the fold of it to s, having reserved - * enough room in s0 for the purpose. If we can't reasonably - * peek ahead, we instead assume the worst case: that it is - * something that would form the completion of a multi-char - * fold. + * be something that would continue on in an EXACTish node if + * there were space. We append the fold of it to s, having + * reserved enough room in s0 for the purpose. If we can't + * reasonably peek ahead, we instead assume the worst case: + * that it is something that would form the completion of a + * multi-char fold. * * If we can't split between s and ender, we work backwards * character-by-character down to s0. At each current point @@ -15135,23 +15211,18 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * have to map that back to the original */ if (need_to_fold_loc) { upper_fill = loc_correspondence[s - s_start]; - Safefree(locfold_buf); - Safefree(loc_correspondence); - if (upper_fill == 0) { FAIL2("panic: loc_correspondence[%d] is 0", (int) (s - s_start)); } + Safefree(locfold_buf); + Safefree(loc_correspondence); } else { upper_fill = s - s0; } goto reparse; } - else if (need_to_fold_loc) { - Safefree(locfold_buf); - Safefree(loc_correspondence); - } /* Here the node consists entirely of non-final multi-char * folds. (Likely it is all 'f's or all 's's.) There's no @@ -15159,6 +15230,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * whole thing */ len = old_s - s0; } + + if (need_to_fold_loc) { + Safefree(locfold_buf); + Safefree(loc_correspondence); + } } /* End of verifying node ends with an appropriate char */ /* We need to start the next node at the character that didn't fit @@ -15319,7 +15395,6 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) * sets up the bitmap and any flags, removing those code points from the * inversion list, setting it to NULL should it become completely empty */ - dVAR; PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; assert(PL_regkind[OP(node)] == ANYOF); @@ -15359,9 +15434,7 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) ? end : NUM_ANYOF_CODE_POINTS - 1; for (i = start; i <= (int) high; i++) { - if (! ANYOF_BITMAP_TEST(node, i)) { - ANYOF_BITMAP_SET(node, i); - } + ANYOF_BITMAP_SET(node, i); } } invlist_iterfinish(*invlist_ptr); @@ -16176,7 +16249,7 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, if ( posix_warnings && RExC_warn_text - && av_top_index(RExC_warn_text) > -1) + && av_count(RExC_warn_text) > 0) { *posix_warnings = RExC_warn_text; } @@ -16428,7 +16501,10 @@ redo_curchar: /* If more than a single node returned, the nested * parens evaluated to more than just a (?[...]), * which isn't legal */ - || node != 1) { + || RExC_emit != orig_emit + + NODE_STEP_REGNODE + + regarglen[REGEX_SET]) + { vFAIL("Expecting interpolated extended charclass"); } resultant_invlist = (SV *) ARGp(REGNODE_p(node)); @@ -16479,6 +16555,8 @@ redo_curchar: goto regclass_failed; } + assert(current); + /* regclass() will return with parsing just the \ sequence, * leaving the parse pointer at the next thing to parse */ RExC_parse--; @@ -16516,9 +16594,7 @@ redo_curchar: goto regclass_failed; } - if (! current) { - break; - } + assert(current); /* function call leaves parse pointing to the ']', except if we * faked it */ @@ -16827,7 +16903,7 @@ redo_curchar: if (RExC_sets_depth) { /* If within a recursive call, return in a special regnode */ RExC_parse++; - node = regpnode(pRExC_state, REGEX_SET, (void *) final); + node = regpnode(pRExC_state, REGEX_SET, final); } else { @@ -17176,10 +17252,10 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN c * * There is a line below that uses the same white space criteria but is outside * this macro. Both here and there must use the same definition */ -#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \ +#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \ STMT_START { \ if (do_skip) { \ - while (isBLANK_A(UCHARAT(p))) \ + while (p < stop_p && isBLANK_A(UCHARAT(p))) \ { \ p++; \ } \ @@ -17229,7 +17305,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * UTF-8 */ - dVAR; UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; IV range = 0; UV value = OOB_UNICODE, save_value = OOB_UNICODE; @@ -17333,6 +17408,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, PERL_UNUSED_ARG(depth); #endif + assert(! (ret_invlist && allow_mutiple_chars)); /* If wants an inversion list returned, we can't optimize to something * else. */ @@ -17354,7 +17430,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, initial_listsv_len = SvCUR(listsv); SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ - SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); assert(RExC_parse <= RExC_end); @@ -17363,7 +17439,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, invert = TRUE; allow_mutiple_chars = FALSE; MARK_NAUGHTY(1); - SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); } /* Check that they didn't say [:posix:] instead of [[:posix:]] */ @@ -17410,12 +17486,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, output_posix_warnings(pRExC_state, posix_warnings); } + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); + if (RExC_parse >= stop_ptr) { break; } - SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); - if (UCHARAT(RExC_parse) == ']') { break; } @@ -17670,6 +17746,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* If set TRUE, the property is user-defined as opposed to * official Unicode */ bool user_defined = FALSE; + AV * strings = NULL; SV * prop_definition = parse_uniprop_string( name, n, UTF, FOLD, @@ -17680,6 +17757,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * this call */ ! cBOOL(ret_invlist), + &strings, &user_defined, msg, 0 /* Base level */ @@ -17697,7 +17775,66 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SvCUR(msg), SvPVX(msg))); } - if (! is_invlist(prop_definition)) { + assert(prop_definition || strings); + + if (strings) { + if (ret_invlist) { + if (! prop_definition) { + RExC_parse = e + 1; + vFAIL("Unicode string properties are not implemented in (?[...])"); + } + else { + ckWARNreg(e + 1, + "Using just the single character results" + " returned by \\p{} in (?[...])"); + } + } + else if (! RExC_in_multi_char_class) { + if (invert ^ (value == 'P')) { + RExC_parse = e + 1; + vFAIL("Inverting a character class which contains" + " a multi-character sequence is illegal"); + } + + /* For each multi-character string ... */ + while (av_count(strings) > 0) { + /* ... Each entry is itself an array of code + * points. */ + AV * this_string = (AV *) av_shift( strings); + STRLEN cp_count = av_count(this_string); + SV * final = newSV(cp_count * 4); + SvPVCLEAR(final); + + /* Create another string of sequences of \x{...} */ + while (av_count(this_string) > 0) { + SV * character = av_shift(this_string); + UV cp = SvUV(character); + + if (cp > 255) { + REQUIRE_UTF8(flagp); + } + Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}", + cp); + SvREFCNT_dec_NN(character); + } + SvREFCNT_dec_NN(this_string); + + /* And add that to the list of such things */ + multi_char_matches + = add_multi_match(multi_char_matches, + final, + cp_count); + } + } + SvREFCNT_dec_NN(strings); + } + + if (! prop_definition) { /* If we got only a string, + this iteration didn't really + find a character */ + element_count--; + } + else if (! is_invlist(prop_definition)) { /* Here, the definition isn't known, so we have gotten * returned a string that will be evaluated if and when @@ -18043,7 +18180,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } /* end of namedclass \blah */ - SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); /* If 'range' is set, 'value' is the ending of a range--check its * validity. (If value isn't a single code point in the case of a @@ -18086,7 +18223,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, char* next_char_ptr = RExC_parse + 1; /* Get the next real char after the '-' */ - SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr); + SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end); /* If the '-' is at the end of the class (just before the ']', * it is a literal minus; otherwise it is a range */ @@ -18223,7 +18360,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * printable should have each end point be a portable value * for it (preferably like 'A', but we don't warn if it is * a (portable) Unicode name or code point), and the range - * must be be all digits or all letters of the same case. + * must be all digits or all letters of the same case. * Otherwise, the range is non-portable and unclear as to * what it contains */ if ( (isPRINT_A(prevvalue) || isPRINT_A(value)) @@ -18415,7 +18552,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } #endif - /* Look at the longest folds first */ + /* Look at the longest strings first */ for (cp_count = av_tindex_skip_len_mg(multi_char_matches); cp_count > 0; cp_count--) @@ -18441,15 +18578,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } /* If the character class contains anything else besides these - * multi-character folds, have to include it in recursive parsing */ + * multi-character strings, have to include it in recursive parsing */ if (element_count) { - sv_catpvs(substitute_parse, "|["); + bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '['; + + sv_catpvs(substitute_parse, "|"); + if (has_l_bracket) { /* Add an [ if the original had one */ + sv_catpvs(substitute_parse, "["); + } constructed_prefix_len = SvCUR(substitute_parse); sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); - /* Put in a closing ']' only if not going off the end, as otherwise - * we are adding something that really isn't there */ - if (RExC_parse < RExC_end) { + /* Put in a closing ']' to match any opening one, but not if going + * off the end, as otherwise we are adding something that really + * isn't there */ + if (has_l_bracket && RExC_parse < RExC_end) { sv_catpvs(substitute_parse, "]"); } } @@ -18475,7 +18618,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ret = reg(pRExC_state, 1, ®_flags, depth+1); - *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8); + *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8); /* And restore so can parse the rest of the pattern */ RExC_parse = save_parse; @@ -18875,7 +19018,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (ret_invlist) { *ret_invlist = cp_list; - return RExC_emit; + return (cp_list) ? RExC_emit : 0; } if (anyof_flags & ANYOF_LOCALE_FLAGS) { @@ -19216,7 +19359,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * points) in the ASCII range, so we can't use it here to * artificially restrict the fold domain, so we check if * the class does or does not match some EXACTFish node. - * Further, if we aren't under /i, and and the folded-to + * Further, if we aren't under /i, and the folded-to * character is part of a multi-character fold, we can't do * this optimization, as the sequence around it could be * that multi-character fold, and we don't here know the @@ -19883,6 +20026,9 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list)); } + /* (Note that if any of this changes, the size calculations in + * S_optimize_regclass() might need to be updated.) */ + if (only_utf8_locale_list) { av_store(av, ONLY_LOCALE_MATCHES_INDEX, SvREFCNT_inc_NN(only_utf8_locale_list)); @@ -20379,12 +20525,12 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) } /* -- regpnode - emit a temporary node with a void* argument +- regpnode - emit a temporary node with a SV* argument */ STATIC regnode_offset /* Location. */ -S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, void * arg) +S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg) { - const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regvnode"); + const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode"); regnode_offset ptr = ret; PERL_ARGS_ASSERT_REGPNODE; @@ -20545,7 +20691,8 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state, PERL_UNUSED_ARG(depth); #endif - /* Find last node. */ + /* The final node in the chain is the first one with a nonzero next pointer + * */ scan = (regnode_offset) p; for (;;) { regnode * const temp = regnext(REGNODE_p(scan)); @@ -20563,6 +20710,7 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state, scan = REGNODE_OFFSET(temp); } + /* Populate this node's next pointer */ assert(val >= scan); if (reg_off_by_arg[OP(REGNODE_p(scan))]) { assert((UV) (val - scan) <= U32_MAX); @@ -20631,30 +20779,15 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, } #endif if ( exact ) { - switch (OP(REGNODE_p(scan))) { - case LEXACT: - case EXACT: - case LEXACT_REQ8: - case EXACT_REQ8: - case EXACTL: - case EXACTF: - case EXACTFU_S_EDGE: - case EXACTFAA_NO_TRIE: - case EXACTFAA: - case EXACTFU: - case EXACTFU_REQ8: - case EXACTFLU8: - case EXACTFUP: - case EXACTFL: - if( exact == PSEUDO ) - exact= OP(REGNODE_p(scan)); - else if ( exact != OP(REGNODE_p(scan)) ) - exact= 0; - case NOTHING: - break; - default: + if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) { + if (exact == PSEUDO ) + exact= OP(REGNODE_p(scan)); + else if (exact != OP(REGNODE_p(scan)) ) exact= 0; } + else if (OP(REGNODE_p(scan)) != NOTHING) { + exact= 0; + } } DEBUG_PARSE_r({ DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); @@ -20955,7 +21088,6 @@ void Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) { #ifdef DEBUGGING - dVAR; int k; RXi_GET_DECL(prog, progi); DECLARE_AND_GET_RE_DEBUG_FLAGS; @@ -21314,11 +21446,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ : (OP(o) == ANYOFH || OP(o) == ANYOFR) ? 0xFF : lowest; - Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest); - if (lowest != highest) { - Perl_sv_catpvf(aTHX_ sv, "-%02X", highest); +#ifndef EBCDIC + if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o))) +#endif + { + Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest); + if (lowest != highest) { + Perl_sv_catpvf(aTHX_ sv, "-%02X", highest); + } + Perl_sv_catpvf(aTHX_ sv, ")"); } - Perl_sv_catpvf(aTHX_ sv, ")"); } SvREFCNT_dec(unresolved); @@ -21397,7 +21534,9 @@ SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ /* Returns an SV containing a string that must appear in the target for it - * to match */ + * to match, or NULL if nothing is known that must match. + * + * CAUTION: the SV can be freed during execution of the regex engine */ struct regexp *const prog = ReANY(r); DECLARE_AND_GET_RE_DEBUG_FLAGS; @@ -21666,7 +21805,6 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) U32 refcount; reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; #ifdef USE_ITHREADS - dVAR; #endif OP_REFCNT_LOCK; refcount = --aho->refcount; @@ -21695,7 +21833,6 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) U32 refcount; reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; #ifdef USE_ITHREADS - dVAR; #endif OP_REFCNT_LOCK; refcount = --trie->refcount; @@ -21731,15 +21868,18 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL) /* - re_dup_guts - duplicate a regexp. +=for apidoc re_dup_guts +Duplicate a regexp. - This routine is expected to clone a given regexp structure. It is only - compiled under USE_ITHREADS. +This routine is expected to clone a given regexp structure. It is only +compiled under USE_ITHREADS. - After all of the core data stored in struct regexp is duplicated - the regexp_engine.dupe method is used to copy any private data - stored in the *pprivate pointer. This allows extensions to handle - any duplication it needs to do. +After all of the core data stored in struct regexp is duplicated +the C method is used to copy any private data +stored in the *pprivate pointer. This allows extensions to handle +any duplication they need to do. + +=cut See pregfree() and regfree_internal() if you change anything here. */ @@ -21748,7 +21888,6 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) void Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) { - dVAR; I32 npar; const struct regexp *r = ReANY(sstr); struct regexp *ret = ReANY(dstr); @@ -21851,7 +21990,6 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) void * Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) { - dVAR; struct regexp *const r = ReANY(rx); regexp_internal *reti; int len; @@ -22090,8 +22228,6 @@ S_put_code_point(pTHX_ SV *sv, UV c) } } -#define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C - STATIC void S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) { @@ -22116,9 +22252,11 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) UV this_end; const char * format; - if (end - start < min_range_count) { - - /* Output chars individually when they occur in short ranges */ + if ( end - start < min_range_count + && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end)))) + { + /* Output a range of 1 or 2 chars individually, or longer ranges + * when printable */ for (; start <= end; start++) { put_code_point(sv, start); } @@ -22342,7 +22480,6 @@ S_put_charclass_bitmap_innards_common(pTHX_ * output would have been only the inversion indicator '^', NULL is instead * returned. */ - dVAR; SV * output; PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; @@ -22449,7 +22586,6 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, * cases where it can't try inverting, as what actually matches isn't known * until runtime, and hence the inversion isn't either. */ - dVAR; bool inverting_allowed = ! force_as_is_display; int i; @@ -22590,7 +22726,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, int inverted_bias, as_is_bias; - /* We will apply our bias to whichever of the the results doesn't have + /* We will apply our bias to whichever of the results doesn't have * the '^' */ if (invert) { invert = FALSE; @@ -22842,7 +22978,6 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, void Perl_init_uniprops(pTHX) { - dVAR; # ifdef DEBUGGING char * dump_len_string; @@ -23328,6 +23463,7 @@ S_handle_user_defined_property(pTHX_ this_definition = parse_uniprop_string(s0, s - s0, is_utf8, to_fold, runtime, deferrable, + NULL, user_defined_ptr, msg, (name_len == 0) ? level /* Don't increase level @@ -23419,7 +23555,7 @@ S_handle_user_defined_property(pTHX_ # define CUR_CONTEXT aTHX # define ORIGINAL_CONTEXT save_aTHX # else -# define DECLARATION_FOR_GLOBAL_CONTEXT +# define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP # define SWITCH_TO_GLOBAL_CONTEXT NOOP # define RESTORE_CONTEXT NOOP # define CUR_CONTEXT NULL @@ -23433,7 +23569,6 @@ S_delete_recursion_entry(pTHX_ void *key) * properties. This is a function so it can be set up to be called even if * the program unexpectedly quits */ - dVAR; SV ** current_entry; const STRLEN key_len = strlen((const char *) key); DECLARATION_FOR_GLOBAL_CONTEXT; @@ -23517,6 +23652,8 @@ S_parse_uniprop_string(pTHX_ const bool runtime, /* TRUE if this is being called at run time */ const bool deferrable, /* TRUE if it's ok for the definition to not be known at this call */ + AV ** strings, /* To return string property values, like named + sequences */ bool *user_defined_ptr, /* Upon return from this function it will be set to TRUE if any component is a user-defined property */ @@ -23524,7 +23661,6 @@ S_parse_uniprop_string(pTHX_ this */ const STRLEN level) /* Recursion level of this call */ { - dVAR; char* lookup_name; /* normalized name for lookup in our tables */ unsigned lookup_len; /* Its length */ enum { Not_Strict = 0, /* Some properties have stricter name */ @@ -23767,7 +23903,8 @@ S_parse_uniprop_string(pTHX_ /* Farm this out to a function just to make the current * function less unwieldy */ if (handle_names_wildcard(revised_name, revised_name_len, - &prop_definition)) + &prop_definition, + strings)) { return prop_definition; } @@ -23816,6 +23953,7 @@ S_parse_uniprop_string(pTHX_ to_fold, runtime, deferrable, + NULL, user_defined_ptr, msg, level + 1); @@ -23914,7 +24052,7 @@ S_parse_uniprop_string(pTHX_ goto append_name_to_msg; } - lookup_loose = get_cv("_charnames::_loose_regcomp_lookup", 0); + lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0); if (! lookup_loose) { Perl_croak(aTHX_ "panic: Can't find '_charnames::_loose_regcomp_lookup"); @@ -23945,12 +24083,36 @@ S_parse_uniprop_string(pTHX_ } cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len); - if (character_len < SvCUR(character)) { - /* Temporarily, named sequences aren't handled */ - goto failed; + if (character_len == SvCUR(character)) { + prop_definition = add_cp_to_invlist(NULL, cp); + } + else { + AV * this_string; + + /* First of the remaining characters in the string. */ + char * remaining = SvPVX(character) + character_len; + + if (strings == NULL) { + goto failed; /* XXX Perhaps a specific msg instead, like + 'not available here' */ + } + + if (*strings == NULL) { + *strings = newAV(); + } + + this_string = newAV(); + av_push(this_string, newSVuv(cp)); + + do { + cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len); + av_push(this_string, newSVuv(cp)); + remaining += character_len; + } while (remaining < SvEND(character)); + + av_push(*strings, (SV *) this_string); } - prop_definition = add_cp_to_invlist(NULL, cp); return prop_definition; } @@ -24519,8 +24681,10 @@ S_parse_uniprop_string(pTHX_ /* Try again stripping off any initial 'Is'. This is because we * promise that an initial Is is optional. The same isn't true of * names that start with 'In'. Those can match only blocks, and the - * lookup table already has those accounted for. */ - if (starts_with_Is) { + * lookup table already has those accounted for. The lookup table also + * has already accounted for Perl extensions (without and = sign) + * starting with 'i's'. */ + if (starts_with_Is && equals_pos >= 0) { lookup_name += 2; lookup_len -= 2; equals_pos -= 2; @@ -24868,7 +25032,8 @@ S_parse_uniprop_string(pTHX_ STATIC bool S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ const STRLEN wname_len, /* Its length */ - SV ** prop_definition) + SV ** prop_definition, + AV ** strings) { /* Deal with Name property wildcard subpatterns; returns TRUE if there were * any matches, adding them to prop_definition */ @@ -24887,8 +25052,10 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ where we are now */ bool found_matches = FALSE; /* Did any name match so far? */ SV * empty; /* For matching zero length names */ - SV * must; /* What substring, if any, must be in a name - for the subpattern to match */ + SV * must_sv; /* Contains the substring, if any, that must be + in a name for the subpattern to match */ + const char * must; /* The PV of 'must' */ + STRLEN must_len; /* And its length */ SV * syllable_name = NULL; /* For Hangul syllables */ const char hangul_prefix[] = "HANGUL SYLLABLE "; const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1; @@ -24953,7 +25120,23 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ /* Compile the subpattern consisting of the name being looked for */ subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ ); - must = re_intuit_string(subpattern_re); + + must_sv = re_intuit_string(subpattern_re); + if (must_sv) { + /* regexec.c can free the re_intuit_string() return. GH #17734 */ + must_sv = sv_2mortal(newSVsv(must_sv)); + must = SvPV(must_sv, must_len); + } + else { + must = ""; + must_len = 0; + } + + /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it. + * This works because the NUL causes the function to return early, thus + * showing that there are characters in it other than the acceptable ones, + * which is our desired result.) */ + prog = ReANY(subpattern_re); /* If only nothing is matched, skip to where empty names are looked for */ @@ -24963,10 +25146,7 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ /* And match against the string of all names /gc. Don't even try if it * must match a character not found in any name. */ - if ( ! must - || SvCUR(must) == 0 - || strspn(SvPVX(must), "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") - == SvCUR(must)) + if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len) { while (execute_wildcard(subpattern_re, cur_pos, @@ -24986,7 +25166,9 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ + RX_OFFS(subpattern_re)->end; char * cp_start; char * cp_end; - UV cp; + UV cp = 0; /* Silences some compilers */ + AV * this_string = NULL; + bool is_multi = FALSE; /* If matched nothing, advance to next possible match */ if (this_name_start == this_name_end) { @@ -25021,26 +25203,69 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ /* All code points are 5 digits long */ cp_start = cp_end - 4; + /* This shouldn't happen, as we found a \n, and the first \n is + * further along than what we subtracted */ + assert(cp_start >= all_names_start); + + if (cp_start == all_names_start) { + *prop_definition = add_cp_to_invlist(*prop_definition, 0); + continue; + } + + /* If the character is a blank, we either have a named sequence, or + * something is wrong */ + if (*(cp_start - 1) == ' ') { + cp_start = (char *) my_memrchr(all_names_start, + '\n', + cp_start - all_names_start); + cp_start++; + } + + assert(cp_start != NULL && cp_start >= all_names_start + 2); + /* Except for the first line in the string, the sequence before the * code point is \n\n. If that isn't the case here, we didn't * match the name of a character. (We could have matched a named * sequence, not currently handled */ - if ( cp_start > all_names_start + 1 - && (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n')) - { + if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') { continue; } - /* Calculate the code point from its 5 digits */ - cp = (XDIGIT_VALUE(cp_start[0]) << 16) - + (XDIGIT_VALUE(cp_start[1]) << 12) - + (XDIGIT_VALUE(cp_start[2]) << 8) - + (XDIGIT_VALUE(cp_start[3]) << 4) - + XDIGIT_VALUE(cp_start[4]); - /* We matched! Add this to the list */ - *prop_definition = add_cp_to_invlist(*prop_definition, cp); found_matches = TRUE; + + /* Loop through all the code points in the sequence */ + while (cp_start < cp_end) { + + /* Calculate this code point from its 5 digits */ + cp = (XDIGIT_VALUE(cp_start[0]) << 16) + + (XDIGIT_VALUE(cp_start[1]) << 12) + + (XDIGIT_VALUE(cp_start[2]) << 8) + + (XDIGIT_VALUE(cp_start[3]) << 4) + + XDIGIT_VALUE(cp_start[4]); + + cp_start += 6; /* Go past any blank */ + + if (cp_start < cp_end || is_multi) { + if (this_string == NULL) { + this_string = newAV(); + } + + is_multi = TRUE; + av_push(this_string, newSVuv(cp)); + } + } + + if (is_multi) { /* Was more than one code point */ + if (*strings == NULL) { + *strings = newAV(); + } + + av_push(*strings, (SV *) this_string); + } + else { /* Only a single code point */ + *prop_definition = add_cp_to_invlist(*prop_definition, cp); + } } /* End of loop through the non-algorithmic names string */ } @@ -25061,9 +25286,7 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ * one of the characters in that isn't in any Hangul syllable. */ if ( prog->minlen <= (SSize_t) syl_max_len && prog->maxlen > 0 - && ( ! must - || SvCUR(must) == 0 - || strspn(SvPVX(must), "\n ABCDEGHIJKLMNOPRSTUWY") == SvCUR(must))) + && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len)) { /* These constants, names, values, and algorithm are adapted from the * Unicode standard, version 5.1, section 3.12, and should never @@ -25158,9 +25381,7 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ * series */ if ( prog->minlen <= (SSize_t) SvCUR(algo_name) && prog->maxlen > 0 - && ( ! must - || SvCUR(must) == 0 - || strspn(SvPVX(must), legal) == SvCUR(must))) + && (strspn(must, legal) == must_len)) { for (j = low; j <= high; j++) { /* For each code point in the series */