X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/3071c56699e6a52355bb59a16e1d309f92d749e6..3e699fa01bec7dca385966fb2e0bd8150ad039b6:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 8c3dffd..91d91bc 100644 --- a/regcomp.c +++ b/regcomp.c @@ -86,8 +86,8 @@ EXTERN_C const struct regexp_engine my_reg_engine; # include "regcomp.h" #endif -#include "dquote_static.c" -#include "inline_invlist.c" +#include "dquote_inline.h" +#include "invlist_inline.h" #include "unicode_constants.h" #define HAS_NONLATIN1_FOLD_CLOSURE(i) \ @@ -214,6 +214,7 @@ struct RExC_state_t { #define RExC_mysv2 (pRExC_state->mysv2) #endif + bool seen_unfolded_sharp_s; }; #define RExC_flags (pRExC_state->flags) @@ -226,6 +227,17 @@ struct RExC_state_t { #define RExC_end (pRExC_state->end) #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) + +/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any + * EXACTF node, hence was parsed under /di rules. If later in the parse, + * something forces the pattern into using /ui rules, the sharp s should be + * folded into the sequence 'ss', which takes up more space than previously + * calculated. This means that the sizing pass needs to be restarted. (The + * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node + * that gets converted to /ui (and EXACTFU) occupies the same amount of space, + * so there is no need to resize [perl #125990]. */ +#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s) + #ifdef RE_TRACK_PATTERN_OFFSETS #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ @@ -299,7 +311,9 @@ struct RExC_state_t { #define SPSTART 0x04 /* Starts with * or + */ #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ #define TRYAGAIN 0x10 /* Weeded out a declaration. */ -#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */ +#define RESTART_PASS1 0x20 /* Need to restart sizing pass */ +#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to + calcuate sizes as UTF-8 */ #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) @@ -318,12 +332,30 @@ struct RExC_state_t { #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) -#define REQUIRE_UTF8 STMT_START { \ +#define REQUIRE_UTF8(flagp) STMT_START { \ if (!UTF) { \ - *flagp = RESTART_UTF8; \ + assert(PASS1); \ + *flagp = RESTART_PASS1|NEED_UTF8; \ return NULL; \ } \ - } STMT_END + } STMT_END + +/* Change from /d into /u rules, and restart the parse if we've already seen + * something whose size would increase as a result, by setting *flagp and + * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates + * we've change to /u during the parse. */ +#define REQUIRE_UNI_RULES(flagp, restart_retval) \ + STMT_START { \ + if (DEPENDS_SEMANTICS) { \ + assert(PASS1); \ + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ + RExC_uni_semantics = 1; \ + if (RExC_seen_unfolded_sharp_s) { \ + *flagp |= RESTART_PASS1; \ + return restart_retval; \ + } \ + } \ + } STMT_END /* This converts the named class defined in regcomp.h to its equivalent class * number defined in handy.h. */ @@ -808,9 +840,6 @@ static const scan_data_t zero_scan_data = if (RExC_seen & REG_GPOS_SEEN) \ PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ \ - if (RExC_seen & REG_CANY_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ - \ if (RExC_seen & REG_RECURSE_SEEN) \ PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ \ @@ -1180,7 +1209,9 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, /* If this can match all upper Latin1 code points, have to add them * as well */ - if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) { + if (OP(node) == ANYOFD + && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) + { _invlist_union(invlist, PL_UpperLatin1, &invlist); } @@ -1258,12 +1289,19 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * that should be; while the consequences for having /l bugs is * incorrect matches */ if (ssc_is_anything((regnode_ssc *)and_with)) { - anded_flags |= ANYOF_WARN_SUPER; + anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; } } else { anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); - anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + if (OP(and_with) == ANYOFD) { + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } + else { + anded_flags = ANYOF_FLAGS(and_with) + &( ANYOF_COMMON_FLAGS + |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER); + } } ANYOF_FLAGS(ssc) &= anded_flags; @@ -1414,6 +1452,11 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, else { ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + if (OP(or_with) != ANYOFD) { + ored_flags + |= ANYOF_FLAGS(or_with) + & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + } } ANYOF_FLAGS(ssc) |= ored_flags; @@ -1612,7 +1655,9 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) /* The code in this file assumes that all but these flags aren't relevant * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared * by the time we reach here */ - assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + assert(! (ANYOF_FLAGS(ssc) + & ~( ANYOF_COMMON_FLAGS + |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))); populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); @@ -1626,6 +1671,10 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL; } + if (RExC_contains_locale) { + OP(ssc) = ANYOFL; + } + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); } @@ -2004,7 +2053,7 @@ is the recommended Unicode-aware way of saying #define TRIE_STORE_REVCHAR(val) \ STMT_START { \ if (UTF) { \ - SV *zlopp = newSV(7); /* XXX: optimize me */ \ + SV *zlopp = newSV(UTF8_MAXBYTES); \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ @@ -3655,6 +3704,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * this function, we need to flag any occurrences of the sharp s. * This character forbids trie formation (because of added * complexity) */ +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { OP(scan) = EXACTFA_NO_TRIE; @@ -3662,7 +3714,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, break; } s++; - continue; } } else { @@ -3708,6 +3759,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, *min_subtract += len - 1; s += len; } +#endif } } @@ -5069,7 +5121,6 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); #endif - case CANY: case SANY: if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ ssc_match_all_cp(data->start_class); @@ -5098,6 +5149,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", } break; + case ANYOFD: case ANYOFL: case ANYOF: if (flags & SCF_DO_STCLASS_AND) @@ -6180,6 +6232,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, ENTER; SAVETMPS; + save_re_context(); PUSHSTACKi(PERLSI_REQUIRE); /* G_RE_REPARSING causes the toker to collapse \\ into \ when * parsing qr''; normally only q'' does this. It also alters @@ -6546,7 +6599,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* ignore the utf8ness if the pattern is 0 length */ RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); + RExC_uni_semantics = 0; + RExC_seen_unfolded_sharp_s = 0; RExC_contains_locale = 0; RExC_contains_i = 0; RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT); @@ -6567,8 +6622,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, }); redo_first_pass: - /* we jump here if we upgrade the pattern to utf8 and have to - * recompile */ + /* we jump here if we have to recompile, e.g., from upgrading the pattern + * to utf8 */ if ((pm_flags & PMf_USE_RE_EVAL) /* this second condition covers the non-regex literal case, @@ -6602,7 +6657,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (rx_flags & PMf_FOLD) { RExC_contains_i = 1; } - if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + if ( initial_charset == REGEX_DEPENDS_CHARSET + && (RExC_utf8 ||RExC_uni_semantics)) + { /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ @@ -6688,9 +6745,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, at least some part of the pattern, and therefore must convert the whole thing. -- dmq */ - if (flags & RESTART_UTF8) { - S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + if (flags & RESTART_PASS1) { + if (flags & NEED_UTF8) { + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, pRExC_state->num_code_blocks); + } + else { + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "Need to redo pass 1\n")); + } + goto redo_first_pass; } Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); @@ -6757,7 +6821,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, for (n = 0; n < pRExC_state->num_code_blocks; n++) if (pRExC_state->code_blocks[n].src_regex) SAVEFREESV(pRExC_state->code_blocks[n].src_regex); - SAVEFREEPV(pRExC_state->code_blocks); + if(pRExC_state->code_blocks) + SAVEFREEPV(pRExC_state->code_blocks); /* often null */ } { @@ -6772,25 +6837,25 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, || ! has_charset); bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) == REG_RUN_ON_COMMENT_SEEN); - U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msixn"*/ char *p; - /* Allocate for the worst case, which is all the std flags are turned - * on. If more precision is desired, we could do a population count of - * the flags set. This could be done with a small lookup table, or by - * shifting, masking and adding, or even, when available, assembly - * language for a machine-language population count. - * We never output a minus, as all those are defaults, so are + + /* We output all the necessary flags; we never output a minus, as all + * those are defaults, so are * covered by the caret */ const STRLEN wraplen = plen + has_p + has_runon + has_default /* If needs a caret */ + + PL_bitcount[reganch] /* 1 char for each set standard flag */ /* If needs a character set specifier */ + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) - + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); + /* make sure PL_bitcount bounds not exceeded */ + assert(sizeof(STD_PAT_MODS) <= 8); + Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ r->xpv_len_u.xpvlenu_pv = p; if (RExC_utf8) @@ -7287,8 +7352,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, lookbehind */ if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; - if (RExC_seen & REG_CANY_SEEN) - r->intflags |= PREGf_CANY_SEEN; if (RExC_seen & REG_VERBARG_SEEN) { r->intflags |= PREGf_VERBARG_SEEN; @@ -7319,7 +7382,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * flags appropriately - Yves */ regnode *first = ri->program + 1; U8 fop = OP(first); - regnode *next = NEXTOPER(first); + regnode *next = regnext(first); U8 nop = OP(next); if (PL_regkind[fop] == NOTHING && nop == END) @@ -7333,13 +7396,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->extflags |= RXf_START_ONLY; else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE - && OP(regnext(first)) == END) + && nop == END) r->extflags |= RXf_WHITE; else if ( r->extflags & RXf_SPLIT && (fop == EXACT || fop == EXACTL) && STR_LEN(first) == 1 && *(STRING(first)) == ' ' - && OP(regnext(first)) == END ) + && nop == END ) r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); } @@ -7700,13 +7763,8 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, sv_setpvn(sv, s, i); TAINT_set(oldtainted); #endif - if ( (rx->intflags & PREGf_CANY_SEEN) - ? (RXp_MATCH_UTF8(rx) - && (!i || is_utf8_string((U8*)s, i))) - : (RXp_MATCH_UTF8(rx)) ) - { + if (RXp_MATCH_UTF8(rx)) SvUTF8_on(sv); - } else SvUTF8_off(sv); if (TAINTING_get) { @@ -7991,7 +8049,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * Some of the methods should always be private to the implementation, and some * should eventually be made public */ -/* The header definitions are in F */ +/* The header definitions are in F */ PERL_STATIC_INLINE UV* S__invlist_array_init(SV* const invlist, const bool will_have_0) @@ -9008,7 +9066,13 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) } /* Here, can't just append things, create and return a new inversion list - * which is the union of this range and the existing inversion list */ + * which is the union of this range and the existing inversion list. (If + * the new range is well-behaved wrt to the old one, we could just insert + * it, doing a Move() down on the tail of the old one (potentially growing + * it first). But to determine that means we would have the extra + * (possibly throw-away) work of first finding where the new one goes and + * whether it disrupts (splits) an existing range, so it doesn't appear to + * me (khw) that it's worth it) */ range_invlist = _new_invlist(2); _append_range_to_invlist(range_invlist, start, end); @@ -9747,10 +9811,10 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) #endif /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets - flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan - needs to be restarted. - Otherwise would only return NULL if regbranch() returns NULL, which - cannot happen. */ + flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan + needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be + upgraded to UTF-8. Otherwise would only return NULL if regbranch() returns + NULL, which cannot happen. */ STATIC regnode * S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. @@ -9796,9 +9860,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) STRLEN verb_len = 0; char *start_arg = NULL; unsigned char op = 0; - int argok = 1; - int internal_argval = 0; /* internal_argval is only useful if - !argok */ + int arg_required = 0; + int internal_argval = -1; /* if >-1 we are not allowed an argument*/ if (has_intervening_patws) { RExC_parse++; @@ -9840,14 +9903,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case 'F': /* (*FAIL) */ if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { op = OPFAIL; - argok = 0; } break; case ':': /* (*:NAME) */ case 'M': /* (*MARK:NAME) */ if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { op = MARKPOINT; - argok = -1; + arg_required = 1; } break; case 'P': /* (*PRUNE) */ @@ -9872,36 +9934,30 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) "Unknown verb pattern '%"UTF8f"'", UTF8fARG(UTF, verb_len, start_verb)); } - if ( argok ) { - if ( start_arg && internal_argval ) { - vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); - } else if ( argok < 0 && !start_arg ) { - vFAIL3("Verb pattern '%.*s' has a mandatory argument", - verb_len, start_verb); - } else { - ret = reganode(pRExC_state, op, internal_argval); - if ( ! internal_argval && ! SIZE_ONLY ) { - if (start_arg) { - SV *sv = newSVpvn( start_arg, - RExC_parse - start_arg); - ARG(ret) = add_data( pRExC_state, - STR_WITH_LEN("S")); - RExC_rxi->data->data[ARG(ret)]=(void*)sv; - ret->flags = 0; - } else { - ret->flags = 1; - } - } - } - if (!internal_argval) - RExC_seen |= REG_VERBARG_SEEN; - } else if ( start_arg ) { - vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); - } else { - ret = reg_node(pRExC_state, op); - } + if ( arg_required && !start_arg ) { + vFAIL3("Verb pattern '%.*s' has a mandatory argument", + verb_len, start_verb); + } + if (internal_argval == -1) { + ret = reganode(pRExC_state, op, 0); + } else { + ret = reg2Lanode(pRExC_state, op, 0, internal_argval); + } + RExC_seen |= REG_VERBARG_SEEN; + if ( ! SIZE_ONLY ) { + if (start_arg) { + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); + RExC_rxi->data->data[ARG(ret)]=(void*)sv; + ret->flags = 1; + } else { + ret->flags = 0; + } + if ( internal_argval != -1 ) + ARG2L_SET(ret, internal_argval); + } nextchar(pRExC_state); return ret; } @@ -10064,7 +10120,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) --RExC_parse; nextchar(pRExC_state); if (*RExC_parse == ')') { - ret=reg_node(pRExC_state, OPFAIL); + ret=reganode(pRExC_state, OPFAIL, 0); nextchar(pRExC_state); return ret; } @@ -10267,9 +10323,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) int is_define= 0; const int DEFINE_len = sizeof("DEFINE") - 1; if (RExC_parse[0] == '?') { /* (?(?...)) */ - if (RExC_parse[1] == '=' || RExC_parse[1] == '!' - || RExC_parse[1] == '<' - || RExC_parse[1] == '{') { /* Lookahead or eval. */ + if ( + RExC_parse[1] == '=' || + RExC_parse[1] == '!' || + RExC_parse[1] == '<' || + RExC_parse[1] == '{' + ) { /* Lookahead or eval. */ I32 flag; regnode *tail; @@ -10278,8 +10337,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret->flags = 1; tail = reg(pRExC_state, 1, &flag, depth+1); - if (flag & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flag & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flag & (RESTART_PASS1|NEED_UTF8); return NULL; } REGTAIL(pRExC_state, ret, tail); @@ -10343,7 +10402,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { /* (?(1)...) */ char c; - char *tmp; UV uv; if (grok_atoUV(RExC_parse, &uv, &endptr) && uv <= I32_MAX @@ -10351,24 +10409,23 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) parno = (I32)uv; RExC_parse = (char*)endptr; } - /* XXX else what? */ + else { + vFAIL("panic: grok_atoUV returned FALSE"); + } ret = reganode(pRExC_state, GROUPP, parno); insert_if_check_paren: - if (*(tmp = nextchar(pRExC_state)) != ')') { - /* nextchar also skips comments, so undo its work - * and skip over the the next character. - */ - RExC_parse = tmp; + if (UCHARAT(RExC_parse) != ')') { RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; vFAIL("Switch condition not recognized"); } + nextchar(pRExC_state); insert_if: REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); br = regbranch(pRExC_state, &flags, 1,depth+1); if (br == NULL) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", @@ -10376,7 +10433,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); - c = *nextchar(pRExC_state); + c = UCHARAT(RExC_parse); + nextchar(pRExC_state); if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { @@ -10387,8 +10445,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) lastbr = reganode(pRExC_state, IFTHEN, 0); if (!regbranch(pRExC_state, &flags, 1,depth+1)) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", @@ -10397,7 +10455,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) REGTAIL(pRExC_state, ret, lastbr); if (flags&HASWIDTH) *flagp |= HASWIDTH; - c = *nextchar(pRExC_state); + c = UCHARAT(RExC_parse); + nextchar(pRExC_state); } else lastbr = NULL; @@ -10485,8 +10544,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* branch_len = (paren != 0); */ if (br == NULL) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); @@ -10532,8 +10591,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) br = regbranch(pRExC_state, &flags, 0, depth+1); if (br == NULL) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); @@ -10664,12 +10723,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* Check for proper termination. */ if (paren) { - /* restore original flags, but keep (?p) */ + /* restore original flags, but keep (?p) and, if we've changed from /d + * rules to /u, keep the /u */ RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); - if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { + if (DEPENDS_SEMANTICS && RExC_uni_semantics) { + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); + } + if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') { RExC_parse = oregcomp_parse; vFAIL("Unmatched ("); } + nextchar(pRExC_state); } else if (!paren && RExC_parse < RExC_end) { if (*RExC_parse == ')') { @@ -10694,8 +10758,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) * * Implements the concatenation operator. * - * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be - * restarted. + * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be + * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 */ STATIC regnode * S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) @@ -10734,8 +10798,8 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) if (latest == NULL) { if (flags & TRYAGAIN) continue; - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); @@ -10777,8 +10841,8 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) * * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with * TRYAGAIN. - * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be - * restarted. + * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be + * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 */ STATIC regnode * S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) @@ -10807,8 +10871,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = regatom(pRExC_state, &flags,depth+1); if (ret == NULL) { - if (flags & (TRYAGAIN|RESTART_UTF8)) - *flagp |= flags & (TRYAGAIN|RESTART_UTF8); + if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8)) + *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8); else FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); return(NULL); @@ -10874,25 +10938,36 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); RExC_emit = orig_emit; } - ret = reg_node(pRExC_state, OPFAIL); + ret = reganode(pRExC_state, OPFAIL, 0); return ret; } - else if (min == max - && RExC_parse < RExC_end - && (*RExC_parse == '?' || *RExC_parse == '+')) + else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?') { if (PASS2) { ckWARN2reg(RExC_parse + 1, "Useless use of greediness modifier '%c'", *RExC_parse); } - /* Absorb the modifier, so later code doesn't see nor use - * it */ + /* Absorb the modifier, so later code doesn't see nor use it */ nextchar(pRExC_state); } do_curly: if ((flags&SIMPLE)) { + if (min == 0 && max == REG_INFTY) { + reginsert(pRExC_state, STAR, ret, depth+1); + ret->flags = 0; + MARK_NAUGHTY(4); + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + goto nest_check; + } + if (min == 1 && max == REG_INFTY) { + reginsert(pRExC_state, PLUS, ret, depth+1); + ret->flags = 0; + MARK_NAUGHTY(3); + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + goto nest_check; + } MARK_NAUGHTY_EXP(2, 2); reginsert(pRExC_state, CURLY, ret, depth+1); Set_Node_Offset(ret, parse_start+1); /* MJD */ @@ -10966,22 +11041,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); - if (op == '*' && (flags&SIMPLE)) { - reginsert(pRExC_state, STAR, ret, depth+1); - ret->flags = 0; - MARK_NAUGHTY(4); - RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; - } - else if (op == '*') { + if (op == '*') { min = 0; goto do_curly; } - else if (op == '+' && (flags&SIMPLE)) { - reginsert(pRExC_state, PLUS, ret, depth+1); - ret->flags = 0; - MARK_NAUGHTY(3); - RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; - } else if (op == '+') { min = 1; goto do_curly; @@ -11085,14 +11148,17 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, * sequence. *node_p * will be set to a generated node returned by this * function calling S_reg(). * - * The final possibility, which happens only when the fourth one would - * otherwise be in effect, is that one of those code points requires the - * pattern to be recompiled as UTF-8. The function returns FALSE, and sets - * the RESTART_UTF8 flag in *flagp. When this happens, the caller needs to - * desist from continuing parsing, and return this information to its caller. - * This is not set for when there is only one code point, as this can be - * called as part of an ANYOF node, and they can store above-Latin1 code - * points without the pattern having to be in UTF-8. + * The final possibility is that it is premature to be calling this function; + * that pass1 needs to be restarted. This can happen when this changes from + * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The + * latter occurs only when the fourth possibility would otherwise be in + * effect, and is because one of those code points requires the pattern to be + * recompiled as UTF-8. The function returns FALSE, and sets the + * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this + * happens, the caller needs to desist from continuing parsing, and return + * this information to its caller. This is not set for when there is only one + * code point, as this can be called as part of an ANYOF node, and they can + * store above-Latin1 code points without the pattern having to be in UTF-8. * * For non-single-quoted regexes, the tokenizer has resolved character and * sequence names inside \N{...} into their Unicode values, normalizing the @@ -11107,8 +11173,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, * parser. But if the single-quoted regex is something like '\N{U+41}', that * is legal and handled here. The code point is Unicode, and has to be * translated into the native character set for non-ASCII platforms. - * the tokenizer passes the \N sequence through unchanged; this code will not - * attempt to determine this nor expand those, instead raising a syntax error. */ char * endbrace; /* points to '}' following the name */ @@ -11139,7 +11203,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, /* Disambiguate between \N meaning a named character versus \N meaning * [^\n]. The latter is assumed when the {...} following the \N is a legal - * quantifier, or there is no a '{' at all */ + * quantifier, or there is no '{' at all */ if (*p != '{' || regcurly(p)) { RExC_parse = p; if (cp_count) { @@ -11181,7 +11245,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, vFAIL("\\N{NAME} must be resolved by the lexer"); } - RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ + REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode + semantics */ if (endbrace == RExC_parse) { /* empty: \N{} */ if (cp_count) { @@ -11321,8 +11386,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, if (node_p) { if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return FALSE; } FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", @@ -11509,8 +11574,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, *character = (U8) code_point; len = 1; } /* Else is folded non-UTF8 */ +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { - +#else + else if (1) { +#endif /* We don't fold any non-UTF8 except possibly the Sharp s (see * comments at join_exact()); */ *character = (U8) code_point; @@ -11553,10 +11623,14 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, /* A single character node is SIMPLE, except for the special-cased SHARP S * under /di. */ - if ((len == 1 || (UTF && len == UNISKIP(code_point))) - && (code_point != LATIN_SMALL_LETTER_SHARP_S - || ! FOLD || ! DEPENDS_SEMANTICS)) - { + if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point))) +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) + && ( code_point != LATIN_SMALL_LETTER_SHARP_S + || ! FOLD || ! DEPENDS_SEMANTICS) +#endif + ) { *flagp |= SIMPLE; } @@ -11642,8 +11716,8 @@ S_backref_value(char *p) Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with TRYAGAIN. - Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be - restarted. + Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be + restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 Otherwise does not return NULL. */ @@ -11704,17 +11778,18 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) TRUE, /* allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ (bool) RExC_strict, + TRUE, /* Allow an optimized regnode result */ NULL); - if (*RExC_parse != ']') { - RExC_parse = oregcomp_parse; - vFAIL("Unmatched ["); - } if (ret == NULL) { - if (*flagp & RESTART_UTF8) + if (*flagp & (RESTART_PASS1|NEED_UTF8)) return NULL; FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", (UV) *flagp); } + if (*RExC_parse != ']') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ["); + } nextchar(pRExC_state); Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ break; @@ -11731,8 +11806,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } goto tryagain; } - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", @@ -11806,13 +11881,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_seen_zerolen++; /* Do not optimize RE away */ goto finish_meta_pat; case 'C': - ret = reg_node(pRExC_state, CANY); - RExC_seen |= REG_CANY_SEEN; - *flagp |= HASWIDTH|SIMPLE; - if (PASS2) { - ckWARNdep(RExC_parse+1, "\\C is deprecated"); - } - goto finish_meta_pat; + vFAIL("\\C no longer supported"); case 'X': ret = reg_node(pRExC_state, CLUMP); *flagp |= HASWIDTH; @@ -11902,7 +11971,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) NOT_REACHED; /*NOTREACHED*/ } RExC_parse = endbrace; - RExC_uni_semantics = 1; + REQUIRE_UNI_RULES(flagp, NULL); if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */ OP(ret) = BOUNDU; @@ -12012,9 +12081,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) It would be a bug if these returned non-portables */ (bool) RExC_strict, + TRUE, /* Allow an optimized regnode result */ NULL); - /* regclass() can only return RESTART_UTF8 if multi-char folds - are allowed. */ + if (*flagp & RESTART_PASS1) + return NULL; + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if + * multi-char folds are allowed. */ if (!ret) FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", (UV) *flagp); @@ -12051,9 +12123,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) break; } - if (*flagp & RESTART_UTF8) + if (*flagp & RESTART_PASS1) return NULL; RExC_parse--; + + /* Here, evaluates to a single code point. Go get that */ goto defchar; case 'k': /* Handle \k and \k'NAME' */ @@ -12224,7 +12298,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) back into the quick-grab loop below */ parse_start--; goto defchar; - } + } /* end of switch on a \foo sequence */ break; case '#': @@ -12242,6 +12316,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_parse++; defchar: { + + /* Here, we have determined that the next thing is probably a + * literal character. (It still may be an escape sequence that + * evaluates to a single character) */ + STRLEN len = 0; UV ender = 0; char *p; @@ -12259,7 +12338,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * string's UTF8ness. The reason to do this is that EXACTF is not * trie-able, EXACTFU is. * - * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they * contain only above-Latin1 characters (hence must be in UTF8), * which don't participate in folds with Latin1-range characters, * as the latter's folds aren't known until runtime. (We don't @@ -12282,8 +12361,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reparse: - /* We do the EXACTFish to EXACT node only if folding. (And we - * don't need to figure this out until pass 2) */ + /* We look for the EXACTFish to EXACT node optimizaton only if + * folding. (And we don't need to figure this out until pass 2) */ maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to @@ -12376,8 +12455,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) flagp, depth) ) { - if (*flagp & RESTART_UTF8) - FAIL("panic: grok_bslash_N set RESTART_UTF8"); + if (*flagp & NEED_UTF8) + FAIL("panic: grok_bslash_N set NEED_UTF8"); + if (*flagp & RESTART_PASS1) + return NULL; /* Here, it wasn't a single code point. Go close * up this EXACTish node. The switch() prior to @@ -12387,7 +12468,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } p = RExC_parse; if (ender > 0xff) { - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } break; case 'r': @@ -12434,7 +12515,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto recode_encoding; } if (ender > 0xff) { - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } break; } @@ -12472,7 +12553,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } else { - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } break; } @@ -12517,7 +12598,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) STRLEN numlen = 3; ender = grok_oct(p, &numlen, &flags, NULL); if (ender > 0xff) { - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } p += numlen; if (PASS2 /* like \08, \178 */ @@ -12539,7 +12620,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ender = reg_recode((const char)(U8)ender, &enc); if (!enc && PASS2) ckWARNreg(p, "Invalid escape in the specified encoding"); - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } break; case '\0': @@ -12548,7 +12629,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* FALLTHROUGH */ default: if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { - /* Include any { following the alpha to emphasize + /* Include any left brace following the alpha to emphasize * that it could be part of an escape at some point * in the future */ int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1; @@ -12560,7 +12641,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) case '{': /* Currently we don't warn when the lbrace is at the start * of a construct. This catches it in the middle of a - * literal string, or when its the first thing after + * literal string, or when it's the first thing after * something like "\b" */ if (! SIZE_ONLY && (len || (p > RExC_start && isALPHA_A(*(p -1))))) @@ -12570,7 +12651,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /*FALLTHROUGH*/ default: /* A literal character */ normal_default: - if (UTF8_IS_START(*p) && UTF) { + if (! UTF8_IS_INVARIANT(*p) && UTF) { STRLEN numlen; ender = utf8n_to_uvchr((U8*)p, RExC_end - p, &numlen, UTF8_ALLOW_DEFAULT); @@ -12582,9 +12663,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } /* End of switch on the literal */ /* Here, have looked at the literal character and - * contains its ordinal,

points to the character after it - */ - + * contains its ordinal,

points to the character after it. + * We need to check if the next non-ignored thing is a + * quantifier. Move

to after anything that should be + * ignored, which, as a side effect, positions

for the next + * loop iteration */ if ( RExC_flags & RXf_PMf_EXTENDED) p = regpatws(pRExC_state, p, TRUE); /* means recognize comments */ @@ -12602,6 +12685,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto loopdone; } + /* Ready to add 'ender' to the node */ + if (! FOLD) { /* The simple case, just append the literal */ /* In the sizing pass, we need only the size of the @@ -12609,7 +12694,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * its representation until PASS2. */ if (SIZE_ONLY) { if (UTF) { - const STRLEN unilen = UNISKIP(ender); + const STRLEN unilen = UVCHR_SKIP(ender); s += unilen; /* We have to subtract 1 just below (and again in @@ -12660,11 +12745,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } else /* A regular FOLD code point */ if (! ( UTF +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU - && ender == LATIN_SMALL_LETTER_SHARP_S))) - { + && ender == LATIN_SMALL_LETTER_SHARP_S) +#endif + )) { /* Here, are folding and are not UTF-8 encoded; therefore * the character must be in the range 0-255, and is not /l * (Not /l because we already handled these under /l in @@ -12675,13 +12764,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* See if the character's fold differs between /d and * /u. This includes the multi-char fold SHARP S to * 'ss' */ - if (maybe_exactfu + if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) { + RExC_seen_unfolded_sharp_s = 1; + maybe_exactfu = FALSE; + } + else if (maybe_exactfu && (PL_fold[ender] != PL_fold_latin1[ender] - || ender == LATIN_SMALL_LETTER_SHARP_S - || (len > 0 - && isALPHA_FOLD_EQ(ender, 's') - && isALPHA_FOLD_EQ(*(s-1), 's')))) - { +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) + || ( len > 0 + && isALPHA_FOLD_EQ(ender, 's') + && isALPHA_FOLD_EQ(*(s-1), 's')) +#endif + )) { maybe_exactfu = FALSE; } } @@ -12690,7 +12786,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * we have an array that finds its fold quickly */ *(s++) = (char) ender; } - else { /* FOLD and UTF */ + else { /* FOLD, and UTF (or sharp s) */ /* Unlike the non-fold case, we do actually have to * calculate the results here in pass 1. This is for two * reasons, the folded length may be longer than the @@ -12820,7 +12916,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE( *s, *(s+1)))) { break; @@ -13319,17 +13415,20 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, this function */ const bool save_fold = FOLD; /* Temporary */ char *save_end, *save_parse; /* Temporaries */ + const bool in_locale = LOC; /* we turn off /l during processing */ GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; - if (LOC) { /* XXX could make valid in UTF-8 locales */ - vFAIL("(?[...]) not valid in locale"); + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); } - RExC_uni_semantics = 1; /* The use of this operator implies /u. This - is required so that the compile time values - are valid in all runtime cases */ + + REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u. + This is required so that the compile + time values are valid in all runtime + cases */ /* This will return only an ANYOF regnode, or (unlikely) something smaller * (such as EXACT). Thus we can skip most everything if just sizing. We @@ -13373,8 +13472,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, RExC_parse++; } - /* regclass() can only return RESTART_UTF8 if multi-char - folds are allowed. */ + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 + * if multi-char folds are allowed. */ if (!regclass(pRExC_state, flagp,depth+1, is_posix_class, /* parse the whole char class only if not a @@ -13382,6 +13481,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, FALSE, /* don't allow multi-char folds */ TRUE, /* silence non-portable warnings. */ TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ ¤t )) FAIL2("panic: regclass returned NULL to handle_sets, " @@ -13408,6 +13508,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, nextchar(pRExC_state); Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); + } + return node; } goto no_close; @@ -13437,12 +13541,13 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * a stack. Each entry on the stack is a single character representing one * of the operators; or else a pointer to an operand inversion list. */ -#define IS_OPERAND(a) (! SvIOK(a)) +#define IS_OPERATOR(a) SvIOK(a) +#define IS_OPERAND(a) (! IS_OPERATOR(a)) /* The stack is kept in Łukasiewicz order. (That's pronounced similar * to luke-a-shave-itch (or -itz), but people who didn't want to bother - * with prounouncing it called it Reverse Polish instead, but now that YOU - * know how to prounounce it you can use the correct term, thus giving due + * with pronouncing it called it Reverse Polish instead, but now that YOU + * know how to pronounce it you can use the correct term, thus giving due * credit to the person who invented it, and impressing your geek friends. * Wikipedia says that the pronounciation of "Ł" has been changing so that * it is now more like an English initial W (as in wonk) than an L.) @@ -13609,13 +13714,14 @@ redo_curchar: /* If the top entry on the stack is an operator, it had * better be a '!', otherwise the entry below the top * operand should be an operator */ - if ( ! (top_ptr = av_fetch(stack, top_index, FALSE)) - || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!') - || top_index - fence < 1 - || ! (stacked_ptr = av_fetch(stack, - top_index - 1, - FALSE)) - || IS_OPERAND(*stacked_ptr)) + if ( ! (top_ptr = av_fetch(stack, top_index, FALSE)) + || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!') + || ( IS_OPERAND(*top_ptr) + && ( top_index - fence < 1 + || ! (stacked_ptr = av_fetch(stack, + top_index - 1, + FALSE)) + || ! IS_OPERATOR(*stacked_ptr)))) { RExC_parse++; vFAIL("Unexpected '(' with no preceding operator"); @@ -13628,13 +13734,14 @@ redo_curchar: break; case '\\': - /* regclass() can only return RESTART_UTF8 if multi-char - folds are allowed. */ + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if + * multi-char folds are allowed. */ if (!regclass(pRExC_state, flagp,depth+1, TRUE, /* means parse just the next thing */ FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ ¤t)) { FAIL2("panic: regclass returned NULL to handle_sets, " @@ -13654,14 +13761,15 @@ redo_curchar: RExC_parse++; } - /* regclass() can only return RESTART_UTF8 if multi-char - folds are allowed. */ + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if + * multi-char folds are allowed. */ if(!regclass(pRExC_state, flagp,depth+1, is_posix_class, /* parse the whole char class only if not a posix class */ FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ ¤t )) { @@ -13862,7 +13970,7 @@ redo_curchar: * be an operator */ top_ptr = av_fetch(stack, top_index, FALSE); assert(top_ptr); - if (! IS_OPERAND(*top_ptr)) { + if (IS_OPERATOR(*top_ptr)) { /* The only permissible operator at the top of the stack is * '!', which is applied immediately to this operand. */ @@ -13914,6 +14022,7 @@ redo_curchar: if (av_tindex(stack) < 0 /* Was empty */ || ((final = av_pop(stack)) == NULL) || ! IS_OPERAND(final) + || SvTYPE(final) != SVt_INVLIST || av_tindex(stack) >= 0) /* More left on stack */ { SvREFCNT_dec(final); @@ -13952,8 +14061,8 @@ redo_curchar: * already has all folding taken into consideration, and we don't want * regclass() to add to that */ RExC_flags &= ~RXf_PMf_FOLD; - /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. - */ + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char + * folds are allowed. */ node = regclass(pRExC_state, flagp,depth+1, FALSE, /* means parse the whole char class */ FALSE, /* don't allow multi-char folds */ @@ -13961,14 +14070,42 @@ redo_curchar: well have generated non-portable code points, but they're valid on this machine */ FALSE, /* similarly, no need for strict */ + FALSE, /* Require return to be an ANYOF */ NULL ); if (!node) FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, PTR2UV(flagp)); + + /* Fix up the node type if we are in locale. (We have pretended we are + * under /u for the purposes of regclass(), as this construct will only + * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so + * as to cause any warnings about bad locales to be output in regexec.c), + * and add the flag that indicates to check if not in a UTF-8 locale. The + * reason we above forbid optimization into something other than an ANYOF + * node is simply to minimize the number of code changes in regexec.c. + * Otherwise we would have to create new EXACTish node types and deal with + * them. This decision could be revisited should this construct become + * popular. + * + * (One might think we could look at the resulting ANYOF node and suppress + * the flag if everything is above 255, as those would be UTF-8 only, + * but this isn't true, as the components that led to that result could + * have been locale-affected, and just happen to cancel each other out + * under UTF-8 locales.) */ + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); + + assert(OP(node) == ANYOF); + + OP(node) = ANYOFL; + ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8; + } + if (save_fold) { RExC_flags |= RXf_PMf_FOLD; } + RExC_parse = save_parse + 1; RExC_end = save_end; SvREFCNT_dec_NN(final); @@ -13978,6 +14115,7 @@ redo_curchar: Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ return node; } +#undef IS_OPERATOR #undef IS_OPERAND STATIC void @@ -14024,9 +14162,30 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); break; + +#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */ + case LATIN_SMALL_LETTER_SHARP_S: *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); break; + +#endif + +#if UNICODE_MAJOR_VERSION < 3 \ + || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0) + + /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did + * U+0131. */ + case 'i': + case 'I': + *invlist = + add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); +# if UNICODE_DOT_DOT_VERSION == 1 + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I); +# endif + break; +#endif + default: /* Use deprecated warning to increase the chances of this being * output */ @@ -14099,6 +14258,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, about too large characters */ const bool strict, + bool optimizable, /* ? Allow a non-ANYOF return + node */ SV** ret_invlist /* Return an inversion list, not a node */ ) { @@ -14123,8 +14284,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * are extra bits for \w, etc. in locale ANYOFs, as what these match is not * determinable at compile time * - * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs - * to be restarted. This can only happen if ret_invlist is non-NULL. + * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs + * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded + * to UTF-8. This can only happen if ret_invlist is non-NULL. */ UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; @@ -14219,11 +14381,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, DEBUG_PARSE("clas"); +#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \ + && UNICODE_DOT_DOT_VERSION == 0) + allow_multi_folds = FALSE; +#endif + /* Assume we are going to generate an ANYOF node. */ ret = reganode(pRExC_state, (LOC) ? ANYOFL - : ANYOF, + : (DEPENDS_SEMANTICS) + ? ANYOFD + : ANYOF, 0); if (SIZE_ONLY) { @@ -14370,8 +14540,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, depth) ) { - if (*flagp & RESTART_UTF8) - FAIL("panic: grok_bslash_N set RESTART_UTF8"); + if (*flagp & NEED_UTF8) + FAIL("panic: grok_bslash_N set NEED_UTF8"); + if (*flagp & RESTART_PASS1) + return NULL; if (cp_count < 0) { vFAIL("\\N in a character class must be a named character: \\N{...}"); @@ -14535,6 +14707,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, (value == 'p' ? '+' : '!'), UTF8fARG(UTF, n, name)); has_user_defined_property = TRUE; + optimizable = FALSE; /* Will have to leave this an + ANYOF node */ /* We don't know yet, so have to assume that the * property could match something in the Latin1 range, @@ -14592,7 +14766,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, named */ /* \p means they want Unicode semantics */ - RExC_uni_semantics = 1; + REQUIRE_UNI_RULES(flagp, NULL); } break; case 'n': value = '\n'; break; @@ -14769,6 +14943,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL; ANYOF_POSIXL_ZERO(ret); + + /* We can't change this into some other type of node + * (unless this is the only element, in which case there + * are nodes that mean exactly this) as has runtime + * dependencies */ + optimizable = FALSE; } /* Coverity thinks it is possible for this to be negative; both @@ -14978,7 +15158,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* non-Latin1 code point implies unicode semantics. Must be set in * pass1 so is there for the whole of pass 2 */ if (value > 255) { - RExC_uni_semantics = 1; + REQUIRE_UNI_RULES(flagp, NULL); } /* Ready to process either the single value, or the completed range. @@ -15260,7 +15440,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ret = reg(pRExC_state, 1, ®_flags, depth+1); - *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8); + *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8); RExC_parse = save_parse; RExC_end = save_end; @@ -15280,8 +15460,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * 2) if the character class contains only a single element (including a * single range), we see if there is an equivalent node for it. * Other checks are possible */ - if (! ret_invlist /* Can't optimize if returning the constructed - inversion list */ + if ( optimizable + && ! ret_invlist /* Can't optimize if returning the constructed + inversion list */ && (UNLIKELY(posixl_matches_all) || element_count == 1)) { U8 op = END; @@ -15394,10 +15575,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, op = POSIXA; } } - else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) { + else if (! FOLD || ASCII_FOLD_RESTRICTED) { /* We can optimize A-Z or a-z, but not if they could match - * something like the KELVIN SIGN under /i (/a means they - * can't) */ + * something like the KELVIN SIGN under /i. */ if (prevvalue == 'A') { if (value == 'Z' #ifdef EBCDIC @@ -15493,7 +15673,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Our calculated list will be for Unicode rules. For locale * matching, we have to keep a separate list that is consulted at * runtime only when the locale indicates Unicode rules. For - * non-locale, we just use to the general list */ + * non-locale, we just use the general list */ if (LOC) { use_list = &only_utf8_locale_list; } @@ -15643,7 +15823,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (DEPENDS_SEMANTICS) { /* Under /d, everything in the upper half of the Latin1 range * matches these complements */ - ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII; + ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; } else if (AT_LEAST_ASCII_RESTRICTED) { /* Under /a and /aa, everything above ASCII matches these @@ -15730,7 +15910,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } if (warn_super) { - ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; + ANYOF_FLAGS(ret) + |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + + /* Because an ANYOF node is the only one that warns, this node + * can't be optimized into something else */ + optimizable = FALSE; } } @@ -15752,8 +15937,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (only_utf8_locale_list) { ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; } - else if (cp_list) { /* Look to see if there a 0-255 code point is in - the list */ + else if (cp_list) { /* Look to see if a 0-255 code point is in list */ UV start, end; invlist_iterinit(cp_list); if (invlist_iternext(cp_list, &start, &end) && start < 256) { @@ -15812,21 +15996,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * adjacent such nodes. And if the class is equivalent to things like /./, * expensive run-time swashes can be avoided. Now that we have more * complete information, we can find things necessarily missed by the - * earlier code. I (khw) am not sure how much to look for here. It would - * be easy, but perhaps too slow, to check any candidates against all the - * node types they could possibly match using _invlistEQ(). */ - - if (cp_list - && ! invert - && ! depends_list - && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION - - /* We don't optimize if we are supposed to make sure all non-Unicode - * code points raise a warning, as only ANYOF nodes have this check. - * */ - && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) - { + * earlier code. I (khw) did some benchmarks and found essentially no + * speed difference between using a POSIXA node versus an ANYOF node, so + * there is no reason to optimize, for example [A-Za-z0-9_] into + * [[:word:]]/a (although if we did it in the sizing pass it would save + * space). _invlistEQ() could be used if one ever wanted to do something + * like this at this point in the code */ + + if (optimizable && cp_list && ! invert && ! depends_list) { UV start, end; U8 op = END; /* The optimzation node-type */ const char * cur_parse= RExC_parse; @@ -15835,9 +16012,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (! invlist_iternext(cp_list, &start, &end)) { /* Here, the list is empty. This happens, for example, when a - * Unicode property is the only thing in the character class, and - * it doesn't match anything. (perluniprops.pod notes such - * properties) */ + * Unicode property that doesn't match anything is the only element + * in the character class (perluniprops.pod notes such properties). + * */ op = OPFAIL; *flagp |= HASWIDTH|SIMPLE; } @@ -15893,7 +16070,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } } - } + } /* End of first range contains just a single code point */ else if (start == 0) { if (end == UV_MAX) { op = SANY; @@ -15915,7 +16092,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_parse = (char *)orig_parse; RExC_emit = (regnode *)orig_emit; - ret = reg_node(pRExC_state, op); + if (regarglen[op]) { + ret = reganode(pRExC_state, op, 0); + } else { + ret = reg_node(pRExC_state, op); + } RExC_parse = (char *)cur_parse; @@ -16224,50 +16405,70 @@ S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) return p; } -/* nextchar() - - Advances the parse position, and optionally absorbs - "whitespace" from the inputstream. - - Without /x "whitespace" means (?#...) style comments only, - with /x this means (?#...) and # comments and whitespace proper. - - Returns the RExC_parse point from BEFORE the scan occurs. - - This is the /x friendly way of saying RExC_parse++. -*/ - -STATIC char* -S_nextchar(pTHX_ RExC_state_t *pRExC_state) +STATIC void +S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state, + char ** p, + const bool force_to_xmod + ) { - char* const retval = RExC_parse++; + /* If the text at the current parse position '*p' is a '(?#...)' comment, + * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p' + * is /x whitespace, advance '*p' so that on exit it points to the first + * byte past all such white space and comments */ - PERL_ARGS_ASSERT_NEXTCHAR; + const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED); + + PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT; for (;;) { - if (RExC_end - RExC_parse >= 3 - && *RExC_parse == '(' - && RExC_parse[1] == '?' - && RExC_parse[2] == '#') + if (RExC_end - (*p) >= 3 + && *(*p) == '(' + && *(*p + 1) == '?' + && *(*p + 2) == '#') { - while (*RExC_parse != ')') { - if (RExC_parse == RExC_end) + while (*(*p) != ')') { + if ((*p) == RExC_end) FAIL("Sequence (?#... not terminated"); - RExC_parse++; + (*p)++; } - RExC_parse++; + (*p)++; continue; } - if (RExC_flags & RXf_PMf_EXTENDED) { - char * p = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ - if (p != RExC_parse) { - RExC_parse = p; + + if (use_xmod) { + char * new_p = regpatws(pRExC_state, *p, + TRUE); /* means recognize comments */ + if (new_p != *p) { + *p = new_p; continue; } } - return retval; + + break; } + + return; +} + +/* nextchar() + + Advances the parse position by one byte, unless that byte is the beginning + of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In + those two cases, the parse position is advanced beyond all such comments and + white space. + + This is the (?#...) and /x friendly way of saying RExC_parse++. +*/ + +STATIC void +S_nextchar(pTHX_ RExC_state_t *pRExC_state) +{ + PERL_ARGS_ASSERT_NEXTCHAR; + + RExC_parse++; + + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't assume /x */ ); } STATIC regnode * @@ -16756,7 +16957,7 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_printf(Perl_debug_log, "(SBOL)"); if (r->intflags & PREGf_ANCH_GPOS) PerlIO_printf(Perl_debug_log, "(GPOS)"); - PerlIO_putc(Perl_debug_log, ' '); + (void)PerlIO_putc(Perl_debug_log, ' '); } if (r->intflags & PREGf_GPOS_SEEN) PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); @@ -16895,9 +17096,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } } else if (k == CURLY) { + U32 lo = ARG1(o), hi = ARG2(o); if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ - Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); + Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); + if (hi == REG_INFTY) + sv_catpvs(sv, "INFTY"); + else + Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi); + sv_catpvs(sv, "}"); } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); @@ -16905,7 +17112,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ || k == GROUPP || OP(o)==ACCEPT) { AV *name_list= NULL; - Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ + U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o); + Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); } else if ( pRExC_state ) { @@ -16913,12 +17121,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } if (name_list) { if ( k != REF || (OP(o) < NREF)) { - SV **name= av_fetch(name_list, ARG(o), 0 ); + SV **name= av_fetch(name_list, parno, 0 ); if (name) Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); } else { - SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); + SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); I32 *nums=(I32*)SvPVX(sv_dat); SV **name= av_fetch(name_list, nums[0], 0 ); I32 n; @@ -16961,11 +17169,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); } } - else if (k == VERB) { - if (!o->flags) - Perl_sv_catpvf(aTHX_ sv, ":%"SVf, - SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); - } else if (k == LOGICAL) + else if (k == LOGICAL) /* 2: embedded, otherwise 1 */ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { @@ -16974,8 +17178,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ SV* bitmap_invlist; /* Will hold what the bit map contains */ - if (OP(o) == ANYOFL) - sv_catpvs(sv, "{loc}"); + if (OP(o) == ANYOFL) { + if (flags & ANYOF_LOC_REQ_UTF8) { + sv_catpvs(sv, "{utf8-loc}"); + } + else { + sv_catpvs(sv, "{loc}"); + } + } if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); @@ -17011,7 +17221,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ sv_catpvs(sv, "^"); } - if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) { + if (OP(o) == ANYOFD + && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) + { sv_catpvs(sv, "{non-utf8-latin1-all}"); } @@ -17136,6 +17348,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); else if (OP(o) == SBOL) Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); + + /* add on the verb argument if there is one */ + if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) { + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); + } #else PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(sv); @@ -17710,6 +17928,47 @@ S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); } +/* XXX Here's a total kludge. But we need to re-enter for swash routines. */ + +#ifndef PERL_IN_XSUB_RE +void +Perl_save_re_context(pTHX) +{ + I32 nparens = -1; + I32 i; + + /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ + + if (PL_curpm) { + const REGEXP * const rx = PM_GETRE(PL_curpm); + if (rx) + nparens = RX_NPARENS(rx); + } + + /* RT #124109. This is a complete hack; in the SWASHNEW case we know + * that PL_curpm will be null, but that utf8.pm and the modules it + * loads will only use $1..$3. + * The t/porting/re_context.t test file checks this assumption. + */ + if (nparens == -1) + nparens = 3; + + for (i = 1; i <= nparens; i++) { + char digits[TYPE_CHARS(long)]; + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); + GV *const *const gvp + = (GV**)hv_fetch(PL_defstash, digits, len, 0); + + if (gvp) { + GV * const gv = *gvp; + if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) + save_scalar(gv); + } + } +} +#endif + #ifdef DEBUGGING STATIC void @@ -17886,9 +18145,13 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) this_end = (end < NUM_ANYOF_CODE_POINTS) ? end : NUM_ANYOF_CODE_POINTS - 1; +#if NUM_ANYOF_CODE_POINTS > 256 format = (this_end < 256) ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}" : "\\x{%04"UVXf"}-\\x{%04"UVXf"}"; +#else + format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}"; +#endif GCC_DIAG_IGNORE(-Wformat-nonliteral); Perl_sv_catpvf(aTHX_ sv, format, start, this_end); GCC_DIAG_RESTORE; @@ -18174,11 +18437,5 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, #endif /* DEBUGGING */ /* - * Local variables: - * c-indentation-style: bsd - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - * * ex: set ts=8 sts=4 sw=4 et: */