From d8d1dede53afc4f33cf63203b0992459fe964dc3 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 17 Feb 2020 12:07:07 -0700 Subject: [PATCH] Improve handling of nested qr/(?[...])/ A set operations expression can contain a previously-compiled one interpolated in. Prior to this commit, some heuristics were employed to verify it actually was such a thing, and not a sort of look-alike that wasn't necessarily valid. The heuristics actually forbade legal ones. I don't know of any illegal ones that were let through, but it is certainly possible. Also, the error/warning messages referred to the heuristics, and were unhelpful at best. The technique used instead in this commit is to return a regop only used by this feature for any nested compilations. This guarantees that the caller can determine if the result is valid, and what that result is without having to do any heuristics or inspecting any flags. The error/warning messages are changed to reflect this, and I believe are now helpful. This fixes the bugs in #16779 https://github.com/Perl/perl5/issues/16779#issuecomment-563987618 --- embed.fnc | 2 + embed.h | 1 + pod/perldelta.pod | 34 +++++++++++++++++ pod/perldiag.pod | 36 +++++------------- proto.h | 3 ++ regcomp.c | 109 +++++++++++++++++++++++++++++------------------------- t/re/reg_mesg.t | 2 +- t/re/regex_sets.t | 6 +++ 8 files changed, 114 insertions(+), 79 deletions(-) diff --git a/embed.fnc b/embed.fnc index 20a4120..e8ed893 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2583,6 +2583,8 @@ ES |regnode_offset|regnode_guts|NN RExC_state_t *pRExC_state \ ES |void |change_engine_size|NN RExC_state_t *pRExC_state|const Ptrdiff_t size ES |regnode_offset|reganode|NN RExC_state_t *pRExC_state|U8 op \ |U32 arg +ES |regnode_offset|regpnode|NN RExC_state_t *pRExC_state|U8 op \ + |NN void * arg ES |regnode_offset|reg2Lanode|NN RExC_state_t *pRExC_state \ |const U8 op \ |const U32 arg1 \ diff --git a/embed.h b/embed.h index a1f0418..6d0346a 100644 --- a/embed.h +++ b/embed.h @@ -1052,6 +1052,7 @@ #define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) #define regnode_guts(a,b,c,d) S_regnode_guts(aTHX_ a,b,c,d) #define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c) +#define regpnode(a,b,c) S_regpnode(aTHX_ a,b,c) #define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d) #define scan_commit(a,b,c,d) S_scan_commit(aTHX_ a,b,c,d) #define set_ANYOF_arg(a,b,c,d,e) S_set_ANYOF_arg(aTHX_ a,b,c,d,e) diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 36daf64..3bc0faf 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -80,6 +80,14 @@ specification. There is clearly no demand for them, given that no one has ever complained in the many years the functions were claimed to be available, hence so-called "support" for them is now dropped. +=head2 A bug fix for C<(?[...])> may have caused some patterns to no +longer compile + +See L. The heuristics previously used may have let +some constructs compile (perhaps not with the programmer's intended +effect) that should have been errors. None are known, but it is +possible that some erroneous constructs no longer compile. + =head1 Deprecations XXX Any deprecated features, syntax, modules etc. should be listed here. @@ -262,6 +270,12 @@ and New Warnings XXX L +L%sE +|perldiag/"Expecting interpolated extended charclass in regex; marked by <-- HERE in mE%sE"> + +This is a replacement for several error messages listed under +L. + =back =head3 New Warnings @@ -357,6 +371,18 @@ Some instances of this message previously output the hex digits C, C, C, C, C, and C in lower case. Now they are all consistently upper case. +=item * + +The following three diagnostics have been removed, and replaced by +L%sE> +|perldiag/"Expecting interpolated extended charclass in regex; marked by <-- HERE in mE%sE">. +C%sE>, +C%sE>, +and +C in mE%sE>. + =back =head1 Utility Changes @@ -517,6 +543,14 @@ eg. on C. This has been fixed [GH #17428] C<(?{...})> eval groups in regular expressions no longer unintentionally trigger "EVAL without pos change exceeded limit in regex" [GH #17490]. +=item * + +C<(?[...])> extended bracketed character classes do not wrongly raise an +error on some cases where a previously-compiled such class is +interpolated into another. The heuristics previously used have been +replaced by a reliable method, and hence the diagnostics generated have +changed. See L. + =back =head1 Known Problems diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 6156fb2..fa51f1d 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -2292,36 +2292,18 @@ to denote a capturing group of the form L)>|perlre/(?PARNO) (?-PARNO) (?+PARNO) (?R) (?0)>, but omitted the C<")">. -=item Expecting close paren for nested extended charclass in regex; marked -by <-- HERE in m/%s/ - -(F) While parsing a nested extended character class like: - - (?[ ... (?flags:(?[ ... ])) ... ]) - ^ - -we expected to see a close paren ')' (marked by ^) but did not. - -=item Expecting close paren for wrapper for nested extended charclass in -regex; marked by <-- HERE in m/%s/ - -(F) While parsing a nested extended character class like: - - (?[ ... (?flags:(?[ ... ])) ... ]) - ^ +=item Expecting interpolated extended charclass in regex; marked by <-- +HERE in m/%s/ -we expected to see a close paren ')' (marked by ^) but did not. +(F) It looked like you were attempting to interpolate an +already-compiled extended character class, like so: -=item Expecting '(?flags:(?[...' in regex; marked by S<<-- HERE> in m/%s/ + my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; + ... + qr/(?[ \p{Digit} & $thai_or_lao ])/; -(F) The C<(?[...])> extended character class regular expression construct -only allows character classes (including character class escapes like -C<\d>), operators, and parentheses. The one exception is C<(?flags:...)> -containing at least one flag and exactly one C<(?[...])> construct. -This allows a regular expression containing just C<(?[...])> to be -interpolated. If you see this error message, then you probably -have some other C<(?...)> construct inside your character class. See -L. +But the marked code isn't syntactically correct to be such an +interpolated class. =item Experimental aliasing via reference not enabled diff --git a/proto.h b/proto.h index 4b8beab..266ba598 100644 --- a/proto.h +++ b/proto.h @@ -5715,6 +5715,9 @@ STATIC regnode_offset S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 o STATIC regnode_offset S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth); #define PERL_ARGS_ASSERT_REGPIECE \ assert(pRExC_state); assert(flagp) +STATIC regnode_offset S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, void * arg); +#define PERL_ARGS_ASSERT_REGPNODE \ + assert(pRExC_state); assert(arg) STATIC bool S_regtail(pTHX_ RExC_state_t * pRExC_state, const regnode_offset p, const regnode_offset val, const U32 depth) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_REGTAIL \ diff --git a/regcomp.c b/regcomp.c index 2d2dc8b..1d758a9 100644 --- a/regcomp.c +++ b/regcomp.c @@ -142,6 +142,8 @@ struct RExC_state_t { U32 seen; SSize_t size; /* Number of regnode equivalents in pattern */ + Size_t sets_depth; /* Counts recursion depth of already- + compiled regex set patterns */ /* position beyond 'precomp' of the warning message furthest away from * 'precomp'. During the parse, no warnings are raised for any problems @@ -266,6 +268,7 @@ struct RExC_state_t { #define RExC_paren_names (pRExC_state->paren_names) #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_sets_depth (pRExC_state->sets_depth) #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) #define RExC_study_chunk_recursed_bytes \ (pRExC_state->study_chunk_recursed_bytes) @@ -6421,6 +6424,11 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", if (trie->jump) /* no more substrings -- for now /grr*/ flags &= ~SCF_DO_SUBSTR; } + else if (OP(scan) == REGEX_SET) { + Perl_croak(aTHX_ "panic: %s regnode should be resolved" + " before optimization", reg_name[REGEX_SET]); + } + #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ @@ -7670,6 +7678,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_study_chunk_recursed = NULL; RExC_study_chunk_recursed_bytes= 0; RExC_recurse_count = 0; + RExC_sets_depth = 0; pRExC_state->code_index = 0; /* Initialize the string in the compiled pattern. This is so that there is @@ -16229,6 +16238,9 @@ redo_curchar: && UCHARAT(RExC_parse + 1) == '?' && UCHARAT(RExC_parse + 2) == '^') { + const regnode_offset orig_emit = RExC_emit; + SV * resultant_invlist; + /* If is a '(?^', could be an embedded '(?^flags:(?[...])'. * This happens when we have some thing like * @@ -16238,62 +16250,33 @@ redo_curchar: * * Here we would be handling the interpolated * '$thai_or_lao'. We handle this by a recursive call to - * ourselves which returns the inversion list the - * interpolated expression evaluates to. We use the flags - * from the interpolated pattern. */ - U32 save_flags = RExC_flags; - const char * save_parse; - - RExC_parse += 2; /* Skip past the '(?' */ - save_parse = RExC_parse; - - /* Parse the flags for the '(?'. We already know the first - * flag to parse is a '^' */ - parse_lparen_question_flags(pRExC_state); - - if ( RExC_parse >= RExC_end - 4 - || UCHARAT(RExC_parse) != ':' - || UCHARAT(++RExC_parse) != '(' - || UCHARAT(++RExC_parse) != '?' - || UCHARAT(++RExC_parse) != '[') - { + * reg which returns the inversion list the + * interpolated expression evaluates to. Actually, the + * return is a special regnode containing a pointer to that + * inversion list. If the return isn't that regnode alone, + * we know that this wasn't such an interpolation, which is + * an error: we need to get a single inversion list back + * from the recursion */ - /* In combination with the above, this moves the - * pointer to the point just after the first erroneous - * character. */ - if (RExC_parse >= RExC_end - 4) { - RExC_parse = RExC_end; - } - else if (RExC_parse != save_parse) { - RExC_parse += (UTF) - ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) - : 1; - } - vFAIL("Expecting '(?flags:(?[...'"); - } - - /* Recurse, with the meat of the embedded expression */ RExC_parse++; - if (! handle_regex_sets(pRExC_state, ¤t, flagp, - depth+1, oregcomp_parse)) - { - RETURN_FAIL_ON_RESTART(*flagp, flagp); - } + RExC_sets_depth++; - /* Here, 'current' contains the embedded expression's - * inversion list, and RExC_parse points to the trailing - * ']'; the next character should be the ')' */ - RExC_parse++; - if (UCHARAT(RExC_parse) != ')') - vFAIL("Expecting close paren for nested extended charclass"); + node = reg(pRExC_state, 2, flagp, depth+1); + RETURN_FAIL_ON_RESTART(*flagp, flagp); - /* Then the ')' matching the original '(' handled by this - * case: statement */ - RExC_parse++; - if (UCHARAT(RExC_parse) != ')') - vFAIL("Expecting close paren for wrapper for nested extended charclass"); + if ( OP(REGNODE_p(node)) != REGEX_SET + /* If more than a single node returned, the nested + * parens evaluated to more than just a (?[...]), + * which isn't legal */ + || node != 1) { + vFAIL("Expecting interpolated extended charclass"); + } + resultant_invlist = (SV *) ARGp(REGNODE_p(node)); + current = invlist_clone(resultant_invlist, NULL); + SvREFCNT_dec(resultant_invlist); - RExC_flags = save_flags; + RExC_sets_depth--; + RExC_emit = orig_emit; goto handle_operand; } @@ -16681,6 +16664,13 @@ redo_curchar: return END; } + if (RExC_sets_depth) { /* If within a recursive call, return in a special + regnode */ + RExC_parse++; + node = regpnode(pRExC_state, REGEX_SET, (void *) final); + } + else { + /* Otherwise generate a resultant node, based on 'final'. regclass() is * expecting a string of ranges and individual code points */ invlist_iterinit(final); @@ -16764,6 +16754,7 @@ redo_curchar: ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; } + } nextchar(pRExC_state); Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */ @@ -20216,6 +20207,22 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) return(ret); } +/* +- regpnode - emit a temporary node with a void* argument +*/ +STATIC regnode_offset /* Location. */ +S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, void * arg) +{ + const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regvnode"); + regnode_offset ptr = ret; + + PERL_ARGS_ASSERT_REGPNODE; + + FILL_ADVANCE_NODE_ARGp(ptr, op, arg); + RExC_emit = ptr; + return(ret); +} + STATIC regnode_offset S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2) { diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index 29d1ae9..5fea6fe 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -311,7 +311,7 @@ my @death = '/\p{Latin}{,4 }/' => 'Unescaped left brace in regex is illegal here {#} m/\p{Latin}{{#},4 }/', '/(?<=/' => 'Sequence (?... not terminated {#} m/(?<={#}/', # [perl #128170] '/\p{vertical tab}/' => 'Can\'t find Unicode property definition "vertical tab" {#} m/\\p{vertical tab}{#}/', # [perl #132055] - "/$bug133423/" => "Operand with no preceding operator {#} m/(?[(?^:(?[\\]))\\{#}]|2[^^]\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80])R.\\670/", + "/$bug133423/" => "Unexpected ']' with no following ')' in (?[... {#} m/(?[(?^:(?[\\]))\\]{#}|2[^^]\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80])R.\\670/", '/[^/' => 'Unmatched [ {#} m/[{#}^/', # [perl #133767] '/\p{Is_Other_Alphabetic=F}/ ' => 'Can\'t find Unicode property definition "Is_Other_Alphabetic=F" {#} m/\p{Is_Other_Alphabetic=F}{#}/', '/\p{Is_Other_Alphabetic=F}/ ' => 'Can\'t find Unicode property definition "Is_Other_Alphabetic=F" {#} m/\p{Is_Other_Alphabetic=F}{#}/', diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t index fc089a9..ee79059 100644 --- a/t/re/regex_sets.t +++ b/t/re/regex_sets.t @@ -220,6 +220,12 @@ for my $char ("Ù ", "Ù¥", "Ù©") { qr/(?[\P{Is0}])/', qr/\QUnknown user-defined property name "Is0"/, {}, "[perl #133889]"); } +{ + my $s = qr/(?x:(?[ [ x ] ]))/; + like("x", qr/(?[ $s ])/ , "Modifier flags in interpolated set don't" + . " disrupt"); +} + done_testing(); 1; -- 1.8.3.1