X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/e7267dfc40d2acfbd9b984b7da2585a62df1b772..73103bd71331afc1a8e2fcbb0b7a406151e835fd:/regcomp.c diff --git a/regcomp.c b/regcomp.c index f26676b..df60d1b 100644 --- a/regcomp.c +++ b/regcomp.c @@ -86,9 +86,8 @@ EXTERN_C const struct regexp_engine my_reg_engine; # include "regcomp.h" #endif -#include "dquote_static.c" -#include "charclass_invlists.h" -#include "inline_invlist.c" +#include "dquote_inline.h" +#include "invlist_inline.h" #include "unicode_constants.h" #define HAS_NONLATIN1_FOLD_CLOSURE(i) \ @@ -181,6 +180,9 @@ struct RExC_state_t { I32 contains_locale; I32 contains_i; I32 override_recoding; +#ifdef EBCDIC + I32 recode_x_to_native; +#endif I32 in_multi_char_class; struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ @@ -212,6 +214,7 @@ struct RExC_state_t { #define RExC_mysv2 (pRExC_state->mysv2) #endif + bool seen_unfolded_sharp_s; }; #define RExC_flags (pRExC_state->flags) @@ -224,6 +227,17 @@ struct RExC_state_t { #define RExC_end (pRExC_state->end) #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) + +/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any + * EXACTF node, hence was parsed under /di rules. If later in the parse, + * something forces the pattern into using /ui rules, the sharp s should be + * folded into the sequence 'ss', which takes up more space than previously + * calculated. This means that the sizing pass needs to be restarted. (The + * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node + * that gets converted to /ui (and EXACTFU) occupies the same amount of space, + * so there is no need to resize [perl #125990]. */ +#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s) + #ifdef RE_TRACK_PATTERN_OFFSETS #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ @@ -256,6 +270,9 @@ struct RExC_state_t { #define RExC_contains_locale (pRExC_state->contains_locale) #define RExC_contains_i (pRExC_state->contains_i) #define RExC_override_recoding (pRExC_state->override_recoding) +#ifdef EBCDIC +# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) +#endif #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) #define RExC_frame_head (pRExC_state->frame_head) #define RExC_frame_last (pRExC_state->frame_last) @@ -294,7 +311,9 @@ struct RExC_state_t { #define SPSTART 0x04 /* Starts with * or + */ #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ #define TRYAGAIN 0x10 /* Weeded out a declaration. */ -#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */ +#define RESTART_PASS1 0x20 /* Need to restart sizing pass */ +#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to + calcuate sizes as UTF-8 */ #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) @@ -313,12 +332,30 @@ struct RExC_state_t { #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) -#define REQUIRE_UTF8 STMT_START { \ +#define REQUIRE_UTF8(flagp) STMT_START { \ if (!UTF) { \ - *flagp = RESTART_UTF8; \ + assert(PASS1); \ + *flagp = RESTART_PASS1|NEED_UTF8; \ return NULL; \ } \ - } STMT_END + } STMT_END + +/* Change from /d into /u rules, and restart the parse if we've already seen + * something whose size would increase as a result, by setting *flagp and + * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates + * we've change to /u during the parse. */ +#define REQUIRE_UNI_RULES(flagp, restart_retval) \ + STMT_START { \ + if (DEPENDS_SEMANTICS) { \ + assert(PASS1); \ + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ + RExC_uni_semantics = 1; \ + if (RExC_seen_unfolded_sharp_s) { \ + *flagp |= RESTART_PASS1; \ + return restart_retval; \ + } \ + } \ + } STMT_END /* This converts the named class defined in regcomp.h to its equivalent class * number defined in handy.h. */ @@ -624,7 +661,7 @@ static const scan_data_t zero_scan_data = } STMT_END /* A specialized version of vFAIL2 that works with UTF8f */ -#define vFAIL2utf8f(m, a1) STMT_START { \ +#define vFAIL2utf8f(m, a1) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ if (!SIZE_ONLY) \ SAVEFREESV(RExC_rx_sv); \ @@ -632,6 +669,14 @@ static const scan_data_t zero_scan_data = REPORT_LOCATION_ARGS(offset)); \ } STMT_END +#define vFAIL3utf8f(m, a1, a2) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + /* These have asserts in them because of [perl #122671] Many warnings in * regcomp.c can occur twice. If they get output in pass1 and later in that * pass, the pattern has to be converted to UTF-8 and the pass restarted, they @@ -803,9 +848,6 @@ static const scan_data_t zero_scan_data = if (RExC_seen & REG_GPOS_SEEN) \ PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ \ - if (RExC_seen & REG_CANY_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ - \ if (RExC_seen & REG_RECURSE_SEEN) \ PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ \ @@ -1175,7 +1217,9 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, /* If this can match all upper Latin1 code points, have to add them * as well */ - if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) { + if (OP(node) == ANYOFD + && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) + { _invlist_union(invlist, PL_UpperLatin1, &invlist); } @@ -1253,12 +1297,19 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * that should be; while the consequences for having /l bugs is * incorrect matches */ if (ssc_is_anything((regnode_ssc *)and_with)) { - anded_flags |= ANYOF_WARN_SUPER; + anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; } } else { anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); - anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + if (OP(and_with) == ANYOFD) { + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } + else { + anded_flags = ANYOF_FLAGS(and_with) + &( ANYOF_COMMON_FLAGS + |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER); + } } ANYOF_FLAGS(ssc) &= anded_flags; @@ -1409,6 +1460,11 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, else { ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + if (OP(or_with) != ANYOFD) { + ored_flags + |= ANYOF_FLAGS(or_with) + & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + } } ANYOF_FLAGS(ssc) |= ored_flags; @@ -1607,7 +1663,9 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) /* The code in this file assumes that all but these flags aren't relevant * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared * by the time we reach here */ - assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + assert(! (ANYOF_FLAGS(ssc) + & ~( ANYOF_COMMON_FLAGS + |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))); populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); @@ -1621,6 +1679,10 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL; } + if (RExC_contains_locale) { + OP(ssc) = ANYOFL; + } + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); } @@ -1999,7 +2061,7 @@ is the recommended Unicode-aware way of saying #define TRIE_STORE_REVCHAR(val) \ STMT_START { \ if (UTF) { \ - SV *zlopp = newSV(7); /* XXX: optimize me */ \ + SV *zlopp = newSV(UTF8_MAXBYTES); \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ @@ -3650,6 +3712,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * this function, we need to flag any occurrences of the sharp s. * This character forbids trie formation (because of added * complexity) */ +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { OP(scan) = EXACTFA_NO_TRIE; @@ -3657,7 +3722,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, break; } s++; - continue; } } else { @@ -3703,6 +3767,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, *min_subtract += len - 1; s += len; } +#endif } } @@ -4386,7 +4451,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ( flags & SCF_IN_DEFINE ) || ( - (is_inf_internal || is_inf || data->flags & SF_IS_INF) + (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF)) && ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 ) ) @@ -5064,7 +5129,6 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); #endif - case CANY: case SANY: if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ ssc_match_all_cp(data->start_class); @@ -5093,6 +5157,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", } break; + case ANYOFD: case ANYOFL: case ANYOF: if (flags & SCF_DO_STCLASS_AND) @@ -5638,7 +5703,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", } return final_minlen; } - NOT_REACHED; + NOT_REACHED; /* NOTREACHED */ } STATIC U32 @@ -6175,6 +6240,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, ENTER; SAVETMPS; + save_re_context(); PUSHSTACKi(PERLSI_REQUIRE); /* G_RE_REPARSING causes the toker to collapse \\ into \ when * parsing qr''; normally only q'' does this. It also alters @@ -6413,7 +6479,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, DEBUG_r(if (!PL_colorset) reginitcolors()); -#ifndef PERL_IN_XSUB_RE /* Initialize these here instead of as-needed, as is quick and avoids * having to test them each time otherwise */ if (! PL_AboveLatin1) { @@ -6431,7 +6496,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0, NUM_ANYOF_CODE_POINTS - 1); } -#endif pRExC_state->code_blocks = NULL; pRExC_state->num_code_blocks = 0; @@ -6543,7 +6607,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* ignore the utf8ness if the pattern is 0 length */ RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); + RExC_uni_semantics = 0; + RExC_seen_unfolded_sharp_s = 0; RExC_contains_locale = 0; RExC_contains_i = 0; RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT); @@ -6564,8 +6630,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, }); redo_first_pass: - /* we jump here if we upgrade the pattern to utf8 and have to - * recompile */ + /* we jump here if we have to recompile, e.g., from upgrading the pattern + * to utf8 */ if ((pm_flags & PMf_USE_RE_EVAL) /* this second condition covers the non-regex literal case, @@ -6599,7 +6665,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (rx_flags & PMf_FOLD) { RExC_contains_i = 1; } - if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + if ( initial_charset == REGEX_DEPENDS_CHARSET + && (RExC_utf8 ||RExC_uni_semantics)) + { /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ @@ -6632,6 +6700,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; RExC_override_recoding = 0; +#ifdef EBCDIC + RExC_recode_x_to_native = 0; +#endif RExC_in_multi_char_class = 0; /* First pass: determine size, legality. */ @@ -6682,9 +6753,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, at least some part of the pattern, and therefore must convert the whole thing. -- dmq */ - if (flags & RESTART_UTF8) { - S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + if (flags & RESTART_PASS1) { + if (flags & NEED_UTF8) { + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, pRExC_state->num_code_blocks); + } + else { + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "Need to redo pass 1\n")); + } + goto redo_first_pass; } Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); @@ -6751,7 +6829,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, for (n = 0; n < pRExC_state->num_code_blocks; n++) if (pRExC_state->code_blocks[n].src_regex) SAVEFREESV(pRExC_state->code_blocks[n].src_regex); - SAVEFREEPV(pRExC_state->code_blocks); + if(pRExC_state->code_blocks) + SAVEFREEPV(pRExC_state->code_blocks); /* often null */ } { @@ -6766,25 +6845,25 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, || ! has_charset); bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) == REG_RUN_ON_COMMENT_SEEN); - U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msixn"*/ char *p; - /* Allocate for the worst case, which is all the std flags are turned - * on. If more precision is desired, we could do a population count of - * the flags set. This could be done with a small lookup table, or by - * shifting, masking and adding, or even, when available, assembly - * language for a machine-language population count. - * We never output a minus, as all those are defaults, so are + + /* We output all the necessary flags; we never output a minus, as all + * those are defaults, so are * covered by the caret */ const STRLEN wraplen = plen + has_p + has_runon + has_default /* If needs a caret */ + + PL_bitcount[reganch] /* 1 char for each set standard flag */ /* If needs a character set specifier */ + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) - + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); + /* make sure PL_bitcount bounds not exceeded */ + assert(sizeof(STD_PAT_MODS) <= 8); + Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ r->xpv_len_u.xpvlenu_pv = p; if (RExC_utf8) @@ -7281,8 +7360,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, lookbehind */ if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; - if (RExC_seen & REG_CANY_SEEN) - r->intflags |= PREGf_CANY_SEEN; if (RExC_seen & REG_VERBARG_SEEN) { r->intflags |= PREGf_VERBARG_SEEN; @@ -7313,7 +7390,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * flags appropriately - Yves */ regnode *first = ri->program + 1; U8 fop = OP(first); - regnode *next = NEXTOPER(first); + regnode *next = regnext(first); U8 nop = OP(next); if (PL_regkind[fop] == NOTHING && nop == END) @@ -7327,13 +7404,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->extflags |= RXf_START_ONLY; else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE - && OP(regnext(first)) == END) + && nop == END) r->extflags |= RXf_WHITE; else if ( r->extflags & RXf_SPLIT && (fop == EXACT || fop == EXACTL) && STR_LEN(first) == 1 && *(STRING(first)) == ' ' - && OP(regnext(first)) == END ) + && nop == END ) r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); } @@ -7694,13 +7771,8 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, sv_setpvn(sv, s, i); TAINT_set(oldtainted); #endif - if ( (rx->intflags & PREGf_CANY_SEEN) - ? (RXp_MATCH_UTF8(rx) - && (!i || is_utf8_string((U8*)s, i))) - : (RXp_MATCH_UTF8(rx)) ) - { + if (RXp_MATCH_UTF8(rx)) SvUTF8_on(sv); - } else SvUTF8_off(sv); if (TAINTING_get) { @@ -7894,7 +7966,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", (unsigned long) flags); } - NOT_REACHED; /* NOT REACHED */ + NOT_REACHED; /* NOTREACHED */ } return NULL; } @@ -7985,7 +8057,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * Some of the methods should always be private to the implementation, and some * should eventually be made public */ -/* The header definitions are in F */ +/* The header definitions are in F */ PERL_STATIC_INLINE UV* S__invlist_array_init(SV* const invlist, const bool will_have_0) @@ -9002,7 +9074,13 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) } /* Here, can't just append things, create and return a new inversion list - * which is the union of this range and the existing inversion list */ + * which is the union of this range and the existing inversion list. (If + * the new range is well-behaved wrt to the old one, we could just insert + * it, doing a Move() down on the tail of the old one (potentially growing + * it first). But to determine that means we would have the extra + * (possibly throw-away) work of first finding where the new one goes and + * whether it disrupts (splits) an existing range, so it doesn't appear to + * me (khw) that it's worth it) */ range_invlist = _new_invlist(2); _append_range_to_invlist(range_invlist, start, end); @@ -9719,9 +9797,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) ++RExC_parse; } - if (PASS2) { - STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count); - } + vFAIL("Sequence (?... not terminated"); } /* @@ -9741,10 +9817,10 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) #endif /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets - flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan - needs to be restarted. - Otherwise would only return NULL if regbranch() returns NULL, which - cannot happen. */ + flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan + needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be + upgraded to UTF-8. Otherwise would only return NULL if regbranch() returns + NULL, which cannot happen. */ STATIC regnode * S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. @@ -9790,9 +9866,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) STRLEN verb_len = 0; char *start_arg = NULL; unsigned char op = 0; - int argok = 1; - int internal_argval = 0; /* internal_argval is only useful if - !argok */ + int arg_required = 0; + int internal_argval = -1; /* if >-1 we are not allowed an argument*/ if (has_intervening_patws) { RExC_parse++; @@ -9834,14 +9909,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case 'F': /* (*FAIL) */ if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { op = OPFAIL; - argok = 0; } break; case ':': /* (*:NAME) */ case 'M': /* (*MARK:NAME) */ if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { op = MARKPOINT; - argok = -1; + arg_required = 1; } break; case 'P': /* (*PRUNE) */ @@ -9866,36 +9940,30 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) "Unknown verb pattern '%"UTF8f"'", UTF8fARG(UTF, verb_len, start_verb)); } - if ( argok ) { - if ( start_arg && internal_argval ) { - vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); - } else if ( argok < 0 && !start_arg ) { - vFAIL3("Verb pattern '%.*s' has a mandatory argument", - verb_len, start_verb); - } else { - ret = reganode(pRExC_state, op, internal_argval); - if ( ! internal_argval && ! SIZE_ONLY ) { - if (start_arg) { - SV *sv = newSVpvn( start_arg, - RExC_parse - start_arg); - ARG(ret) = add_data( pRExC_state, - STR_WITH_LEN("S")); - RExC_rxi->data->data[ARG(ret)]=(void*)sv; - ret->flags = 0; - } else { - ret->flags = 1; - } - } - } - if (!internal_argval) - RExC_seen |= REG_VERBARG_SEEN; - } else if ( start_arg ) { - vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); - } else { - ret = reg_node(pRExC_state, op); - } + if ( arg_required && !start_arg ) { + vFAIL3("Verb pattern '%.*s' has a mandatory argument", + verb_len, start_verb); + } + if (internal_argval == -1) { + ret = reganode(pRExC_state, op, 0); + } else { + ret = reg2Lanode(pRExC_state, op, 0, internal_argval); + } + RExC_seen |= REG_VERBARG_SEEN; + if ( ! SIZE_ONLY ) { + if (start_arg) { + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); + RExC_rxi->data->data[ARG(ret)]=(void*)sv; + ret->flags = 1; + } else { + ret->flags = 0; + } + if ( internal_argval != -1 ) + ARG2L_SET(ret, internal_argval); + } nextchar(pRExC_state); return ret; } @@ -10055,10 +10123,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '!': /* (?!...) */ RExC_seen_zerolen++; /* check if we're really just a "FAIL" assertion */ - --RExC_parse; - nextchar(pRExC_state); + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); if (*RExC_parse == ')') { - ret=reg_node(pRExC_state, OPFAIL); + ret=reganode(pRExC_state, OPFAIL, 0); nextchar(pRExC_state); return ret; } @@ -10098,14 +10166,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (RExC_parse == RExC_end || *RExC_parse != ')') vFAIL("Sequence (?&... not terminated"); goto gen_recurse_regop; - /* NOT REACHED */ + /* NOTREACHED */ case '+': if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { RExC_parse++; vFAIL("Illegal pattern"); } goto parse_recursion; - /* NOT REACHED*/ + /* NOTREACHED*/ case '-': /* (?-1) */ if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { RExC_parse--; /* rewind to let it be handled later */ @@ -10124,10 +10192,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; is_neg = TRUE; } - unum = grok_atou(RExC_parse, &endptr); - num = (unum > I32_MAX) ? I32_MAX : (I32)unum; - if (endptr) - RExC_parse = (char*)endptr; + if (grok_atoUV(RExC_parse, &unum, &endptr) + && unum <= I32_MAX + ) { + num = (I32)unum; + RExC_parse = (char*)endptr; + } else + num = I32_MAX; if (is_neg) { /* Some limit for num? */ num = -num; @@ -10178,7 +10249,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; - /* NOT REACHED */ + /* NOTREACHED */ case '?': /* (??...) */ is_logical = 1; @@ -10258,9 +10329,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) int is_define= 0; const int DEFINE_len = sizeof("DEFINE") - 1; if (RExC_parse[0] == '?') { /* (?(?...)) */ - if (RExC_parse[1] == '=' || RExC_parse[1] == '!' - || RExC_parse[1] == '<' - || RExC_parse[1] == '{') { /* Lookahead or eval. */ + if ( + RExC_parse[1] == '=' || + RExC_parse[1] == '!' || + RExC_parse[1] == '<' || + RExC_parse[1] == '{' + ) { /* Lookahead or eval. */ I32 flag; regnode *tail; @@ -10269,8 +10343,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret->flags = 1; tail = reg(pRExC_state, 1, &flag, depth+1); - if (flag & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flag & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flag & (RESTART_PASS1|NEED_UTF8); return NULL; } REGTAIL(pRExC_state, ret, tail); @@ -10311,9 +10385,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; parno = 0; if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { - parno = grok_atou(RExC_parse, &endptr); - if (endptr) + UV uv; + if (grok_atoUV(RExC_parse, &uv, &endptr) + && uv <= I32_MAX + ) { + parno = (I32)uv; RExC_parse = (char*)endptr; + } + /* else "Switch condition not recognized" below */ } else if (RExC_parse[0] == '&') { SV *sv_dat; RExC_parse++; @@ -10329,27 +10408,30 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { /* (?(1)...) */ char c; - char *tmp; - parno = grok_atou(RExC_parse, &endptr); - if (endptr) - RExC_parse = (char*)endptr; + UV uv; + if (grok_atoUV(RExC_parse, &uv, &endptr) + && uv <= I32_MAX + ) { + parno = (I32)uv; + RExC_parse = (char*)endptr; + } + else { + vFAIL("panic: grok_atoUV returned FALSE"); + } ret = reganode(pRExC_state, GROUPP, parno); insert_if_check_paren: - if (*(tmp = nextchar(pRExC_state)) != ')') { - /* nextchar also skips comments, so undo its work - * and skip over the the next character. - */ - RExC_parse = tmp; + if (UCHARAT(RExC_parse) != ')') { RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; vFAIL("Switch condition not recognized"); } + nextchar(pRExC_state); insert_if: REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); br = regbranch(pRExC_state, &flags, 1,depth+1); if (br == NULL) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", @@ -10357,7 +10439,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); - c = *nextchar(pRExC_state); + c = UCHARAT(RExC_parse); + nextchar(pRExC_state); if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { @@ -10368,8 +10451,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) lastbr = reganode(pRExC_state, IFTHEN, 0); if (!regbranch(pRExC_state, &flags, 1,depth+1)) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", @@ -10378,7 +10461,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) REGTAIL(pRExC_state, ret, lastbr); if (flags&HASWIDTH) *flagp |= HASWIDTH; - c = *nextchar(pRExC_state); + c = UCHARAT(RExC_parse); + nextchar(pRExC_state); } else lastbr = NULL; @@ -10450,6 +10534,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Set_Node_Offset(ret, RExC_parse); /* MJD */ is_open = 1; } else { + /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */ + paren = ':'; ret = NULL; } } @@ -10464,8 +10550,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* branch_len = (paren != 0); */ if (br == NULL) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); @@ -10511,8 +10597,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) br = regbranch(pRExC_state, &flags, 0, depth+1); if (br == NULL) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); @@ -10643,12 +10729,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* Check for proper termination. */ if (paren) { - /* restore original flags, but keep (?p) */ + /* restore original flags, but keep (?p) and, if we've changed from /d + * rules to /u, keep the /u */ RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); - if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { + if (DEPENDS_SEMANTICS && RExC_uni_semantics) { + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); + } + if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') { RExC_parse = oregcomp_parse; vFAIL("Unmatched ("); } + nextchar(pRExC_state); } else if (!paren && RExC_parse < RExC_end) { if (*RExC_parse == ')') { @@ -10673,8 +10764,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) * * Implements the concatenation operator. * - * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be - * restarted. + * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be + * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 */ STATIC regnode * S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) @@ -10705,16 +10796,16 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) *flagp = WORST; /* Tentatively. */ - RExC_parse--; - nextchar(pRExC_state); + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { flags &= ~TRYAGAIN; latest = regpiece(pRExC_state, &flags,depth+1); if (latest == NULL) { if (flags & TRYAGAIN) continue; - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); @@ -10756,8 +10847,8 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) * * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with * TRYAGAIN. - * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be - * restarted. + * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be + * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 */ STATIC regnode * S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) @@ -10773,6 +10864,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start; #endif const char *maxpos = NULL; + UV uv; /* Save the original in case we change the emitted regop to a FAIL. */ regnode * const orig_emit = RExC_emit; @@ -10785,8 +10877,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = regatom(pRExC_state, &flags,depth+1); if (ret == NULL) { - if (flags & (TRYAGAIN|RESTART_UTF8)) - *flagp |= flags & (TRYAGAIN|RESTART_UTF8); + if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8)) + *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8); else FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); return(NULL); @@ -10814,16 +10906,28 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (!maxpos) maxpos = next; RExC_parse++; - min = grok_atou(RExC_parse, &endptr); + if (isDIGIT(*RExC_parse)) { + if (!grok_atoUV(RExC_parse, &uv, &endptr)) + vFAIL("Invalid quantifier in {,}"); + if (uv >= REG_INFTY) + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + min = (I32)uv; + } else { + min = 0; + } if (*maxpos == ',') maxpos++; else maxpos = RExC_parse; - max = grok_atou(maxpos, &endptr); - if (!max && *maxpos != '0') + if (isDIGIT(*maxpos)) { + if (!grok_atoUV(maxpos, &uv, &endptr)) + vFAIL("Invalid quantifier in {,}"); + if (uv >= REG_INFTY) + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + max = (I32)uv; + } else { max = REG_INFTY; /* meaning "infinity" */ - else if (max >= REG_INFTY) - vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + } RExC_parse = next; nextchar(pRExC_state); if (max < min) { /* If can't match, warn and optimize to fail @@ -10834,31 +10938,40 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * enough space for all the things we are about to throw * away, but we can shrink it by the ammount we are about * to re-use here */ - RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; + RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; } else { ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); RExC_emit = orig_emit; } - ret = reg_node(pRExC_state, OPFAIL); + ret = reganode(pRExC_state, OPFAIL, 0); return ret; } - else if (min == max - && RExC_parse < RExC_end - && (*RExC_parse == '?' || *RExC_parse == '+')) + else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?') { if (PASS2) { ckWARN2reg(RExC_parse + 1, "Useless use of greediness modifier '%c'", *RExC_parse); } - /* Absorb the modifier, so later code doesn't see nor use - * it */ - nextchar(pRExC_state); } do_curly: if ((flags&SIMPLE)) { + if (min == 0 && max == REG_INFTY) { + reginsert(pRExC_state, STAR, ret, depth+1); + ret->flags = 0; + MARK_NAUGHTY(4); + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + goto nest_check; + } + if (min == 1 && max == REG_INFTY) { + reginsert(pRExC_state, PLUS, ret, depth+1); + ret->flags = 0; + MARK_NAUGHTY(3); + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + goto nest_check; + } MARK_NAUGHTY_EXP(2, 2); reginsert(pRExC_state, CURLY, ret, depth+1); Set_Node_Offset(ret, parse_start+1); /* MJD */ @@ -10932,22 +11045,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); - if (op == '*' && (flags&SIMPLE)) { - reginsert(pRExC_state, STAR, ret, depth+1); - ret->flags = 0; - MARK_NAUGHTY(4); - RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; - } - else if (op == '*') { + if (op == '*') { min = 0; goto do_curly; } - else if (op == '+' && (flags&SIMPLE)) { - reginsert(pRExC_state, PLUS, ret, depth+1); - ret->flags = 0; - MARK_NAUGHTY(3); - RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; - } else if (op == '+') { min = 1; goto do_curly; @@ -10993,95 +11094,95 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) return(ret); } -STATIC STRLEN -S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, - UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse +STATIC bool +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, + regnode ** node_p, + UV * code_point_p, + int * cp_count, + I32 * flagp, + const U32 depth ) { - - /* This is expected to be called by a parser routine that has recognized '\N' - and needs to handle the rest. RExC_parse is expected to point at the first - char following the N at the time of the call. On successful return, - RExC_parse has been updated to point to just after the sequence identified - by this routine, <*flagp> has been updated, and the non-NULL input pointers - have been set appropriately. - - The typical case for this is \N{some character name}. This is usually - called while parsing the input, filling in or ready to fill in an EXACTish - node, and the code point for the character should be returned, so that it - can be added to the node, and parsing continued with the next input - character. But it may be that instead of a single character the \N{} - expands to more than one, a named sequence. In this case any following - quantifier applies to the whole sequence, and it is easier, given the code - structure that calls this, to handle it from a different area of the code. - For this reason, the input parameters can be set so that it returns valid - only on one or the other of these cases. - - Another possibility is for the input to be an empty \N{}, which for - backwards compatibility we accept, but generate a NOTHING node which should - later get optimized out. This is handled from the area of code which can - handle a named sequence, so if called with the parameters for the other, it - fails. - - Still another possibility is for the \N to mean [^\n], and not a single - character or explicit sequence at all. This is determined by context. - Again, this is handled from the area of code which can handle a named - sequence, so if called with the parameters for the other, it also fails. - - And the final possibility is for the \N to be called from within a bracketed - character class. In this case the [^\n] meaning makes no sense, and so is - an error. Other anomalous situations are left to the calling code to handle. - - For non-single-quoted regexes, the tokenizer has attempted to decide which - of the above applies, and in the case of a named sequence, has converted it - into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, - where c1... are the characters in the sequence. For single-quoted regexes, - the tokenizer passes the \N sequence through unchanged; this code will not - attempt to determine this nor expand those, instead raising a syntax error. - The net effect is that if the beginning of the passed-in pattern isn't '{U+' - or there is no '}', it signals that this \N occurrence means to match a - non-newline. (This mostly was done because of [perl #56444].) - - The API is somewhat convoluted due to historical and the above reasons. - - The function raises an error (via vFAIL), and doesn't return for various - syntax errors. For other failures, it returns (STRLEN) -1. For successes, - it returns a count of how many characters were accounted for by it. (This - can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code - points in the sequence. It sets , , and/or - on success. - - If is non-null, it means the caller can accept an input sequence - consisting of just a single code point; <*valuep> is set to the value of the - only or first code point in the input. - - If is non-null, it means the caller can accept an input - sequence consisting of one or more code points; <*substitute_parse> is a - newly created mortal SV* in this case, containing \x{} escapes representing - those code points. - - Both and can be non-NULL. - - If is non-null, must be NULL. This signifies - that the caller can accept any legal sequence other than a single code - point. To wit, <*node_p> is set as follows: - 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1 - 2) \N{}: points to a new NOTHING node; return is 0 - 3) otherwise: points to a new EXACT node containing the resolved - string; return is the number of code points in the - string. This will never be 1. - Note that failure is returned for single code point sequences if is - null and is not. - */ - - char * endbrace; /* '}' following the name */ - char* p; + /* This routine teases apart the various meanings of \N and returns + * accordingly. The input parameters constrain which meaning(s) is/are valid + * in the current context. + * + * Exactly one of and must be non-NULL. + * + * If is not NULL, the context is expecting the result to be a + * single code point. If this \N instance turns out to a single code point, + * the function returns TRUE and sets *code_point_p to that code point. + * + * If is not NULL, the context is expecting the result to be one of + * the things representable by a regnode. If this \N instance turns out to be + * one such, the function generates the regnode, returns TRUE and sets *node_p + * to point to that regnode. + * + * If this instance of \N isn't legal in any context, this function will + * generate a fatal error and not return. + * + * On input, RExC_parse should point to the first char following the \N at the + * time of the call. On successful return, RExC_parse will have been updated + * to point to just after the sequence identified by this routine. Also + * *flagp has been updated as needed. + * + * When there is some problem with the current context and this \N instance, + * the function returns FALSE, without advancing RExC_parse, nor setting + * *node_p, nor *code_point_p, nor *flagp. + * + * If is not NULL, the caller wants to know the length (in code + * points) that this \N sequence matches. This is set even if the function + * returns FALSE, as detailed below. + * + * There are 5 possibilities here, as detailed in the next 5 paragraphs. + * + * Probably the most common case is for the \N to specify a single code point. + * *cp_count will be set to 1, and *code_point_p will be set to that code + * point. + * + * Another possibility is for the input to be an empty \N{}, which for + * backwards compatibility we accept. *cp_count will be set to 0. *node_p + * will be set to a generated NOTHING node. + * + * Still another possibility is for the \N to mean [^\n]. *cp_count will be + * set to 0. *node_p will be set to a generated REG_ANY node. + * + * The fourth possibility is that \N resolves to a sequence of more than one + * code points. *cp_count will be set to the number of code points in the + * sequence. *node_p * will be set to a generated node returned by this + * function calling S_reg(). + * + * The final possibility is that it is premature to be calling this function; + * that pass1 needs to be restarted. This can happen when this changes from + * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The + * latter occurs only when the fourth possibility would otherwise be in + * effect, and is because one of those code points requires the pattern to be + * recompiled as UTF-8. The function returns FALSE, and sets the + * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this + * happens, the caller needs to desist from continuing parsing, and return + * this information to its caller. This is not set for when there is only one + * code point, as this can be called as part of an ANYOF node, and they can + * store above-Latin1 code points without the pattern having to be in UTF-8. + * + * For non-single-quoted regexes, the tokenizer has resolved character and + * sequence names inside \N{...} into their Unicode values, normalizing the + * result into what we should see here: '\N{U+c1.c2...}', where c1... are the + * hex-represented code points in the sequence. This is done there because + * the names can vary based on what charnames pragma is in scope at the time, + * so we need a way to take a snapshot of what they resolve to at the time of + * the original parse. [perl #56444]. + * + * That parsing is skipped for single-quoted regexes, so we may here get + * '\N{NAME}'. This is a fatal error. These names have to be resolved by the + * parser. But if the single-quoted regex is something like '\N{U+41}', that + * is legal and handled here. The code point is Unicode, and has to be + * translated into the native character set for non-ASCII platforms. + */ + + char * endbrace; /* points to '}' following the name */ char *endchar; /* Points to '.' or '}' ending cur char in the input stream */ - bool has_multiple_chars; /* true if the input stream contains a sequence of - more than one character */ - bool in_char_class = substitute_parse != NULL; - STRLEN count = 0; /* Number of characters in this sequence */ + char* p = RExC_parse; /* Temporary */ GET_RE_DEBUG_FLAGS_DECL; @@ -11089,36 +11190,37 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, GET_RE_DEBUG_FLAGS; - assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ - assert(! (node_p && substitute_parse)); /* At most 1 should be set */ + assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */ + assert(! (node_p && cp_count)); /* At most 1 should be set */ + + if (cp_count) { /* Initialize return for the most common case */ + *cp_count = 1; + } /* The [^\n] meaning of \N ignores spaces and comments under the /x - * modifier. The other meaning does not, so use a temporary until we find + * modifier. The other meanings do not, so use a temporary until we find * out which we are being called with */ - p = (RExC_flags & RXf_PMf_EXTENDED) - ? regpatws(pRExC_state, RExC_parse, - TRUE) /* means recognize comments */ - : RExC_parse; + skip_to_be_ignored_text(pRExC_state, &p, + FALSE /* Don't force to /x */ ); /* Disambiguate between \N meaning a named character versus \N meaning - * [^\n]. The former is assumed when it can't be the latter. */ + * [^\n]. The latter is assumed when the {...} following the \N is a legal + * quantifier, or there is no '{' at all */ if (*p != '{' || regcurly(p)) { RExC_parse = p; + if (cp_count) { + *cp_count = -1; + } + if (! node_p) { - /* no bare \N allowed in a charclass */ - if (in_char_class) { - vFAIL("\\N in a character class must be a named character: \\N{...}"); - } - return (STRLEN) -1; + return FALSE; } - RExC_parse--; /* Need to back off so nextchar() doesn't skip the - current char */ - nextchar(pRExC_state); + *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; MARK_NAUGHTY(1); Set_Node_Length(*node_p, 1); /* MJD */ - return 1; + return TRUE; } /* Here, we have decided it should be a named character or sequence */ @@ -11143,17 +11245,20 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, vFAIL("\\N{NAME} must be resolved by the lexer"); } - RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ + REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode + semantics */ if (endbrace == RExC_parse) { /* empty: \N{} */ - if (node_p) { - *node_p = reg_node(pRExC_state,NOTHING); - } - else if (! in_char_class) { - return (STRLEN) -1; + if (cp_count) { + *cp_count = 0; } nextchar(pRExC_state); - return 0; + if (! node_p) { + return FALSE; + } + + *node_p = reg_node(pRExC_state,NOTHING); + return TRUE; } RExC_parse += 2; /* Skip past the 'U+' */ @@ -11162,116 +11267,128 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, /* Code points are separated by dots. If none, there is only one code * point, and is terminated by the brace */ - has_multiple_chars = (endchar < endbrace); - /* We get the first code point if we want it, and either there is only one, - * or we can accept both cases of one and there is more than one */ - if (valuep && (substitute_parse || ! has_multiple_chars)) { - STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); - I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES + if (endchar >= endbrace) { + STRLEN length_of_hex; + I32 grok_hex_flags; + + /* Here, exactly one code point. If that isn't what is wanted, fail */ + if (! code_point_p) { + RExC_parse = p; + return FALSE; + } + + /* Convert code point from hex */ + length_of_hex = (STRLEN)(endchar - RExC_parse); + grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX /* No errors in the first pass (See [perl * #122671].) We let the code below find the * errors when there are multiple chars. */ - | ((SIZE_ONLY || has_multiple_chars) + | ((SIZE_ONLY) ? PERL_SCAN_SILENT_ILLDIGIT : 0); - *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); + /* This routine is the one place where both single- and double-quotish + * \N{U+xxxx} are evaluated. The value is a Unicode code point which + * must be converted to native. */ + *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse, + &length_of_hex, + &grok_hex_flags, + NULL)); /* The tokenizer should have guaranteed validity, but it's possible to * bypass it by using single quoting, so check. Don't do the check * here when there are multiple chars; we do it below anyway. */ - if (! has_multiple_chars) { - if (length_of_hex == 0 - || length_of_hex != (STRLEN)(endchar - RExC_parse) ) - { - RExC_parse += length_of_hex; /* Includes all the valid */ - RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ - ? UTF8SKIP(RExC_parse) - : 1; - /* Guard against malformed utf8 */ - if (RExC_parse >= endchar) { - RExC_parse = endchar; - } - vFAIL("Invalid hexadecimal number in \\N{U+...}"); + if (length_of_hex == 0 + || length_of_hex != (STRLEN)(endchar - RExC_parse) ) + { + RExC_parse += length_of_hex; /* Includes all the valid */ + RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ + ? UTF8SKIP(RExC_parse) + : 1; + /* Guard against malformed utf8 */ + if (RExC_parse >= endchar) { + RExC_parse = endchar; } - - RExC_parse = endbrace + 1; - return 1; + vFAIL("Invalid hexadecimal number in \\N{U+...}"); } - } - /* Here, we should have already handled the case where a single character - * is expected and found. So it is a failure if we aren't expecting - * multiple chars and got them; or didn't get them but wanted them. We - * fail without advancing the parse, so that the caller can try again with - * different acceptance criteria */ - if ((! node_p && ! substitute_parse) || ! has_multiple_chars) { - RExC_parse = p; - return (STRLEN) -1; + RExC_parse = endbrace + 1; + return TRUE; } - - { - /* What is done here is to convert this to a sub-pattern of the form - * \x{char1}\x{char2}... - * and then either return it in <*substitute_parse> if non-null; or - * call reg recursively to parse it (enclosing in "(?: ... )" ). That - * way, it retains its atomicness, while not having to worry about - * special handling that some code points may have. toke.c has - * converted the original Unicode values to native, so that we can just - * pass on the hex values unchanged. We do have to set a flag to keep - * recoding from happening in the recursion */ - - SV * dummy = NULL; + else { /* Is a multiple character sequence */ + SV * substitute_parse; STRLEN len; char *orig_end = RExC_end; I32 flags; - if (substitute_parse) { - *substitute_parse = newSVpvs(""); + /* Count the code points, if desired, in the sequence */ + if (cp_count) { + *cp_count = 0; + while (RExC_parse < endbrace) { + /* Point to the beginning of the next character in the sequence. */ + RExC_parse = endchar + 1; + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + (*cp_count)++; + } } - else { - substitute_parse = &dummy; - *substitute_parse = newSVpvs("?:"); + + /* Fail if caller doesn't want to handle a multi-code-point sequence. + * But don't backup up the pointer if the caller want to know how many + * code points there are (they can then handle things) */ + if (! node_p) { + if (! cp_count) { + RExC_parse = p; + } + return FALSE; } - *substitute_parse = sv_2mortal(*substitute_parse); + + /* What is done here is to convert this to a sub-pattern of the form + * \x{char1}\x{char2}... and then call reg recursively to parse it + * (enclosing in "(?: ... )" ). That way, it retains its atomicness, + * while not having to worry about special handling that some code + * points may have. */ + + substitute_parse = newSVpvs("?:"); while (RExC_parse < endbrace) { /* Convert to notation the rest of the code understands */ - sv_catpv(*substitute_parse, "\\x{"); - sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse); - sv_catpv(*substitute_parse, "}"); + sv_catpv(substitute_parse, "\\x{"); + sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); + sv_catpv(substitute_parse, "}"); /* Point to the beginning of the next character in the sequence. */ RExC_parse = endchar + 1; endchar = RExC_parse + strcspn(RExC_parse, ".}"); - count++; } - if (! in_char_class) { - sv_catpv(*substitute_parse, ")"); - } + sv_catpv(substitute_parse, ")"); - RExC_parse = SvPV(*substitute_parse, len); + RExC_parse = SvPV(substitute_parse, len); /* Don't allow empty number */ - if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) { + if (len < (STRLEN) 8) { RExC_parse = endbrace; vFAIL("Invalid hexadecimal number in \\N{U+...}"); } RExC_end = RExC_parse + len; - /* The values are Unicode, and therefore not subject to recoding */ + /* The values are Unicode, and therefore not subject to recoding, but + * have to be converted to native on a non-Unicode (meaning non-ASCII) + * platform. */ RExC_override_recoding = 1; +#ifdef EBCDIC + RExC_recode_x_to_native = 1; +#endif if (node_p) { if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; - return (STRLEN) -1; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); + return FALSE; } FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", (UV) flags); @@ -11279,14 +11396,19 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); } + /* Restore the saved values */ RExC_parse = endbrace; RExC_end = orig_end; RExC_override_recoding = 0; +#ifdef EBCDIC + RExC_recode_x_to_native = 0; +#endif + SvREFCNT_dec_NN(substitute_parse); nextchar(pRExC_state); - } - return count; + return TRUE; + } } @@ -11301,10 +11423,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, * it returns U+FFFD (Replacement character) and sets *encp to NULL. */ STATIC UV -S_reg_recode(pTHX_ const char value, SV **encp) +S_reg_recode(pTHX_ const U8 value, SV **encp) { STRLEN numlen = 1; - SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); + SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP); const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); const STRLEN newlen = SvCUR(sv); UV uv = UNICODE_REPLACEMENT; @@ -11452,8 +11574,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, *character = (U8) code_point; len = 1; } /* Else is folded non-UTF8 */ +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { - +#else + else if (1) { +#endif /* We don't fold any non-UTF8 except possibly the Sharp s (see * comments at join_exact()); */ *character = (U8) code_point; @@ -11496,10 +11623,14 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, /* A single character node is SIMPLE, except for the special-cased SHARP S * under /di. */ - if ((len == 1 || (UTF && len == UNISKIP(code_point))) - && (code_point != LATIN_SMALL_LETTER_SHARP_S - || ! FOLD || ! DEPENDS_SEMANTICS)) - { + if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point))) +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) + && ( code_point != LATIN_SMALL_LETTER_SHARP_S + || ! FOLD || ! DEPENDS_SEMANTICS) +#endif + ) { *flagp |= SIMPLE; } @@ -11517,10 +11648,10 @@ static I32 S_backref_value(char *p) { const char* endptr; - UV val = grok_atou(p, &endptr); - if (endptr == p || endptr == NULL || val > I32_MAX) - return I32_MAX; - return (I32)val; + UV val; + if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX) + return (I32)val; + return I32_MAX; } @@ -11585,8 +11716,8 @@ S_backref_value(char *p) Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with TRYAGAIN. - Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be - restarted. + Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be + restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 Otherwise does not return NULL. */ @@ -11595,7 +11726,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { regnode *ret = NULL; I32 flags = 0; - char *parse_start = RExC_parse; + char *parse_start; U8 op; int invert = 0; U8 arg; @@ -11609,6 +11740,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) PERL_ARGS_ASSERT_REGATOM; tryagain: + parse_start = RExC_parse; switch ((U8)*RExC_parse) { case '^': RExC_seen_zerolen++; @@ -11647,17 +11779,18 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) TRUE, /* allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ (bool) RExC_strict, + TRUE, /* Allow an optimized regnode result */ NULL); - if (*RExC_parse != ']') { - RExC_parse = oregcomp_parse; - vFAIL("Unmatched ["); - } if (ret == NULL) { - if (*flagp & RESTART_UTF8) + if (*flagp & (RESTART_PASS1|NEED_UTF8)) return NULL; FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", (UV) *flagp); } + if (*RExC_parse != ']') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ["); + } nextchar(pRExC_state); Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ break; @@ -11674,8 +11807,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } goto tryagain; } - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", @@ -11749,13 +11882,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_seen_zerolen++; /* Do not optimize RE away */ goto finish_meta_pat; case 'C': - ret = reg_node(pRExC_state, CANY); - RExC_seen |= REG_CANY_SEEN; - *flagp |= HASWIDTH|SIMPLE; - if (PASS2) { - ckWARNdep(RExC_parse+1, "\\C is deprecated"); - } - goto finish_meta_pat; + vFAIL("\\C no longer supported"); case 'X': ret = reg_node(pRExC_state, CLUMP); *flagp |= HASWIDTH; @@ -11772,27 +11899,102 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) invert = 1; /* FALLTHROUGH */ case 'b': + { + regex_charset charset = get_regex_charset(RExC_flags); + RExC_seen_zerolen++; RExC_seen |= REG_LOOKBEHIND_SEEN; - op = BOUND + get_regex_charset(RExC_flags); - if (op > BOUNDA) { /* /aa is same as /a */ - op = BOUNDA; - } - else if (op == BOUNDL) { - RExC_contains_locale = 1; - } + op = BOUND + charset; - if (invert) { - op += NBOUND - BOUND; + if (op == BOUNDL) { + RExC_contains_locale = 1; } ret = reg_node(pRExC_state, op); *flagp |= SIMPLE; - if ((U8) *(RExC_parse + 1) == '{') { - /* diag_listed_as: Use "%s" instead of "%s" */ - vFAIL3("Use \"\\%c\\{\" instead of \"\\%c{\"", *RExC_parse, *RExC_parse); + if (*(RExC_parse + 1) != '{') { + FLAGS(ret) = TRADITIONAL_BOUND; + if (PASS2 && op > BOUNDA) { /* /aa is same as /a */ + OP(ret) = BOUNDA; + } + } + else { + STRLEN length; + char name = *RExC_parse; + char * endbrace; + RExC_parse += 2; + endbrace = strchr(RExC_parse, '}'); + + if (! endbrace) { + vFAIL2("Missing right brace on \\%c{}", name); + } + /* XXX Need to decide whether to take spaces or not. Should be + * consistent with \p{}, but that currently is SPACE, which + * means vertical too, which seems wrong + * while (isBLANK(*RExC_parse)) { + RExC_parse++; + }*/ + if (endbrace == RExC_parse) { + RExC_parse++; /* After the '}' */ + vFAIL2("Empty \\%c{}", name); + } + length = endbrace - RExC_parse; + /*while (isBLANK(*(RExC_parse + length - 1))) { + length--; + }*/ + switch (*RExC_parse) { + case 'g': + if (length != 1 + && (length != 3 || strnNE(RExC_parse + 1, "cb", 2))) + { + goto bad_bound_type; + } + FLAGS(ret) = GCB_BOUND; + break; + case 's': + if (length != 2 || *(RExC_parse + 1) != 'b') { + goto bad_bound_type; + } + FLAGS(ret) = SB_BOUND; + break; + case 'w': + if (length != 2 || *(RExC_parse + 1) != 'b') { + goto bad_bound_type; + } + FLAGS(ret) = WB_BOUND; + break; + default: + bad_bound_type: + RExC_parse = endbrace; + vFAIL2utf8f( + "'%"UTF8f"' is an unknown bound type", + UTF8fARG(UTF, length, endbrace - length)); + NOT_REACHED; /*NOTREACHED*/ + } + RExC_parse = endbrace; + REQUIRE_UNI_RULES(flagp, NULL); + + if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */ + OP(ret) = BOUNDU; + length += 4; + + /* Don't have to worry about UTF-8, in this message because + * to get here the contents of the \b must be ASCII */ + ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */ + "Using /u for '%.*s' instead of /%s", + (unsigned) length, + endbrace - length + 1, + (charset == REGEX_ASCII_RESTRICTED_CHARSET) + ? ASCII_RESTRICT_PAT_MODS + : ASCII_MORE_RESTRICT_PAT_MODS); + } } + + if (PASS2 && invert) { + OP(ret) += NBOUND - BOUND; + } goto finish_meta_pat; + } case 'D': invert = 1; @@ -11866,55 +12068,63 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) break; case 'p': case 'P': - { -#ifdef DEBUGGING - char* parse_start = RExC_parse - 2; -#endif + RExC_parse--; + + ret = regclass(pRExC_state, flagp,depth+1, + TRUE, /* means just parse this element */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. It + would be a bug if these returned + non-portables */ + (bool) RExC_strict, + TRUE, /* Allow an optimized regnode result */ + NULL); + if (*flagp & RESTART_PASS1) + return NULL; + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if + * multi-char folds are allowed. */ + if (!ret) + FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + (UV) *flagp); - RExC_parse--; - - ret = regclass(pRExC_state, flagp,depth+1, - TRUE, /* means just parse this element */ - FALSE, /* don't allow multi-char folds */ - FALSE, /* don't silence non-portable warnings. - It would be a bug if these returned - non-portables */ - (bool) RExC_strict, - NULL); - /* regclass() can only return RESTART_UTF8 if multi-char folds - are allowed. */ - if (!ret) - FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", - (UV) *flagp); - - RExC_parse--; - - Set_Node_Offset(ret, parse_start + 2); - Set_Node_Cur_Length(ret, parse_start); - nextchar(pRExC_state); - } + RExC_parse--; + + Set_Node_Offset(ret, parse_start); + Set_Node_Cur_Length(ret, parse_start - 2); + nextchar(pRExC_state); break; case 'N': - /* Handle \N and \N{NAME} with multiple code points here and not - * below because it can be multicharacter. join_exact() will join - * them up later on. Also this makes sure that things like - * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. - * The options to the grok function call causes it to fail if the - * sequence is just a single code point. We then go treat it as - * just another character in the current EXACT node, and hence it - * gets uniform treatment with all the other characters. The - * special treatment for quantifiers is not needed for such single - * character sequences */ + /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the + * \N{...} evaluates to a sequence of more than one code points). + * The function call below returns a regnode, which is our result. + * The parameters cause it to fail if the \N{} evaluates to a + * single code point; we handle those like any other literal. The + * reason that the multicharacter case is handled here and not as + * part of the EXACtish code is because of quantifiers. In + * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it + * this way makes that Just Happen. dmq. + * join_exact() will join this up with adjacent EXACTish nodes + * later on, if appropriate. */ ++RExC_parse; - if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp, - depth, FALSE)) - { - if (*flagp & RESTART_UTF8) - return NULL; - RExC_parse--; - goto defchar; + if (grok_bslash_N(pRExC_state, + &ret, /* Want a regnode returned */ + NULL, /* Fail if evaluates to a single code + point */ + NULL, /* Don't need a count of how many code + points */ + flagp, + depth) + ) { + break; } - break; + + if (*flagp & RESTART_PASS1) + return NULL; + + /* Here, evaluates to a single code point. Go get that */ + RExC_parse = parse_start; + goto defchar; + case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: { @@ -12006,56 +12216,68 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } else { num = S_backref_value(RExC_parse); - /* bare \NNN might be backref or octal - if it is larger than or equal - * RExC_npar then it is assumed to be and octal escape. - * Note RExC_npar is +1 from the actual number of parens*/ + /* bare \NNN might be backref or octal - if it is larger + * than or equal RExC_npar then it is assumed to be an + * octal escape. Note RExC_npar is +1 from the actual + * number of parens. */ /* Note we do NOT check if num == I32_MAX here, as that is * handled by the RExC_npar check */ - if (num > 9 && num >= RExC_npar - && *RExC_parse != '8' && *RExC_parse != '9') + + if ( + /* any numeric escape < 10 is always a backref */ + num > 9 + /* any numeric escape < RExC_npar is a backref */ + && num >= RExC_npar + /* cannot be an octal escape if it starts with 8 */ + && *RExC_parse != '8' + /* cannot be an octal escape it it starts with 9 */ + && *RExC_parse != '9' + ) { - /* Probably a character specified in octal, e.g. \35 */ + /* Probably not a backref, instead likely to be an + * octal character escape, e.g. \35 or \777. + * The above logic should make it obvious why using + * octal escapes in patterns is problematic. - Yves */ + RExC_parse = parse_start; goto defchar; } } - /* at this point RExC_parse definitely points to a backref - * number */ - { -#ifdef RE_TRACK_PATTERN_OFFSETS - char * const parse_start = RExC_parse - 1; /* MJD */ -#endif - while (isDIGIT(*RExC_parse)) - RExC_parse++; - if (hasbrace) { - if (*RExC_parse != '}') - vFAIL("Unterminated \\g{...} pattern"); - RExC_parse++; - } - if (!SIZE_ONLY) { - if (num > (I32)RExC_rx->nparens) - vFAIL("Reference to nonexistent group"); - } - RExC_sawback = 1; - ret = reganode(pRExC_state, - ((! FOLD) - ? REF - : (ASCII_FOLD_RESTRICTED) - ? REFFA - : (AT_LEAST_UNI_SEMANTICS) - ? REFFU - : (LOC) - ? REFFL - : REFF), - num); - *flagp |= HASWIDTH; + /* At this point RExC_parse points at a numeric escape like + * \12 or \88 or something similar, which we should NOT treat + * as an octal escape. It may or may not be a valid backref + * escape. For instance \88888888 is unlikely to be a valid + * backref. */ + while (isDIGIT(*RExC_parse)) + RExC_parse++; + if (hasbrace) { + if (*RExC_parse != '}') + vFAIL("Unterminated \\g{...} pattern"); + RExC_parse++; + } + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) + vFAIL("Reference to nonexistent group"); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? REF + : (ASCII_FOLD_RESTRICTED) + ? REFFA + : (AT_LEAST_UNI_SEMANTICS) + ? REFFU + : (LOC) + ? REFFL + : REFF), + num); + *flagp |= HASWIDTH; - /* override incorrect value set in reganode MJD */ - Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret, parse_start); - RExC_parse--; - nextchar(pRExC_state); - } + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start); + Set_Node_Cur_Length(ret, parse_start-1); + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); } break; case '\0': @@ -12065,26 +12287,34 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) default: /* Do not generate "unrecognized" warnings here, we fall back into the quick-grab loop below */ - parse_start--; + RExC_parse = parse_start; goto defchar; - } + } /* end of switch on a \foo sequence */ break; case '#': - if (RExC_flags & RXf_PMf_EXTENDED) { + + /* '#' comments should have been spaced over before this function was + * called */ + assert((RExC_flags & RXf_PMf_EXTENDED) == 0); + /* + if (RExC_flags & RXf_PMf_EXTENDED) { RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); if (RExC_parse < RExC_end) goto tryagain; } + */ + /* FALLTHROUGH */ default: + defchar: { - parse_start = RExC_parse - 1; - - RExC_parse++; + /* Here, we have determined that the next thing is probably a + * literal character. RExC_parse points to the first byte of its + * definition. (It still may be an escape sequence that evaluates + * to a single character) */ - defchar: { STRLEN len = 0; UV ender = 0; char *p; @@ -12102,7 +12332,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * string's UTF8ness. The reason to do this is that EXACTF is not * trie-able, EXACTFU is. * - * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they * contain only above-Latin1 characters (hence must be in UTF8), * which don't participate in folds with Latin1-range characters, * as the latter's folds aren't known until runtime. (We don't @@ -12125,8 +12355,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reparse: - /* We do the EXACTFish to EXACT node only if folding. (And we - * don't need to figure this out until pass 2) */ + /* We look for the EXACTFish to EXACT node optimizaton only if + * folding. (And we don't need to figure this out until pass 2) */ maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to @@ -12147,15 +12377,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * could back off to end with only a code point that isn't such a * non-final, but it is possible for there not to be any in the * entire node. */ - for (p = RExC_parse - 1; + + assert( ! UTF /* Is at the beginning of a character */ + || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) + || UTF8_IS_START(UCHARAT(RExC_parse))); + + for (p = RExC_parse; len < upper_parse && p < RExC_end; len++) { oldp = p; - if (RExC_flags & RXf_PMf_EXTENDED) - p = regpatws(pRExC_state, p, - TRUE); /* means recognize comments */ + /* White space has already been ignored */ + assert( (RExC_flags & RXf_PMf_EXTENDED) == 0 + || ! is_PATWS_safe((p), RExC_end, UTF)); + switch ((U8)*p) { case '^': case '$': @@ -12207,24 +12443,32 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) p++; break; case 'N': /* Handle a single-code point named character. */ - /* The options cause it to fail if a multiple code - * point sequence. Handle those in the switch() above - * */ RExC_parse = p + 1; - if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL, - &ender, - flagp, - depth, - FALSE - )) { - if (*flagp & RESTART_UTF8) - FAIL("panic: grok_bslash_N set RESTART_UTF8"); + if (! grok_bslash_N(pRExC_state, + NULL, /* Fail if evaluates to + anything other than a + single code point */ + &ender, /* The returned single code + point */ + NULL, /* Don't need a count of + how many code points */ + flagp, + depth) + ) { + if (*flagp & NEED_UTF8) + FAIL("panic: grok_bslash_N set NEED_UTF8"); + if (*flagp & RESTART_PASS1) + return NULL; + + /* Here, it wasn't a single code point. Go close + * up this EXACTish node. The switch() prior to + * this switch handles the other cases */ RExC_parse = p = oldp; goto loopdone; } p = RExC_parse; if (ender > 0xff) { - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } break; case 'r': @@ -12271,7 +12515,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto recode_encoding; } if (ender > 0xff) { - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } break; } @@ -12297,11 +12541,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } ender = result; - if (IN_ENCODING && ender < 0x100) { - goto recode_encoding; + if (ender < 0x100) { +#ifdef EBCDIC + if (RExC_recode_x_to_native) { + ender = LATIN1_TO_NATIVE(ender); + } + else +#endif + if (IN_ENCODING) { + goto recode_encoding; + } } - if (ender > 0xff) { - REQUIRE_UTF8; + else { + REQUIRE_UTF8(flagp); } break; } @@ -12311,6 +12563,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) break; case '8': case '9': /* must be a backreference */ --p; + /* we have an escape like \8 which cannot be an octal escape + * so we exit the loop, and let the outer loop handle this + * escape which may or may not be a legitimate backref. */ goto loopdone; case '1': case '2': case '3':case '4': case '5': case '6': case '7': @@ -12343,7 +12598,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) STRLEN numlen = 3; ender = grok_oct(p, &numlen, &flags, NULL); if (ender > 0xff) { - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } p += numlen; if (PASS2 /* like \08, \178 */ @@ -12362,10 +12617,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) recode_encoding: if (! RExC_override_recoding) { SV* enc = _get_encoding(); - ender = reg_recode((const char)(U8)ender, &enc); + ender = reg_recode((U8)ender, &enc); if (!enc && PASS2) ckWARNreg(p, "Invalid escape in the specified encoding"); - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } break; case '\0': @@ -12374,7 +12629,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* FALLTHROUGH */ default: if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { - /* Include any { following the alpha to emphasize + /* Include any left brace following the alpha to emphasize * that it could be part of an escape at some point * in the future */ int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1; @@ -12386,7 +12641,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) case '{': /* Currently we don't warn when the lbrace is at the start * of a construct. This catches it in the middle of a - * literal string, or when its the first thing after + * literal string, or when it's the first thing after * something like "\b" */ if (! SIZE_ONLY && (len || (p > RExC_start && isALPHA_A(*(p -1))))) @@ -12396,7 +12651,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /*FALLTHROUGH*/ default: /* A literal character */ normal_default: - if (UTF8_IS_START(*p) && UTF) { + if (! UTF8_IS_INVARIANT(*p) && UTF) { STRLEN numlen; ender = utf8n_to_uvchr((U8*)p, RExC_end - p, &numlen, UTF8_ALLOW_DEFAULT); @@ -12408,12 +12663,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } /* End of switch on the literal */ /* Here, have looked at the literal character and - * contains its ordinal,

points to the character after it - */ - - if ( RExC_flags & RXf_PMf_EXTENDED) - p = regpatws(pRExC_state, p, - TRUE); /* means recognize comments */ + * contains its ordinal,

points to the character after it. + * We need to check if the next non-ignored thing is a + * quantifier. Move

to after anything that should be + * ignored, which, as a side effect, positions

for the next + * loop iteration */ + skip_to_be_ignored_text(pRExC_state, &p, + FALSE /* Don't force to /x */ ); /* If the next thing is a quantifier, it applies to this * character only, which means that this character has to be in @@ -12422,12 +12678,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * the node, close the node with just them, and set up to do * this character again next time through, when it will be the * only thing in its new node */ - if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) + if ((next_is_quantifier = ( LIKELY(p < RExC_end) + && UNLIKELY(ISMULT2(p)))) + && LIKELY(len)) { p = oldp; goto loopdone; } + /* Ready to add 'ender' to the node */ + if (! FOLD) { /* The simple case, just append the literal */ /* In the sizing pass, we need only the size of the @@ -12435,7 +12695,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * its representation until PASS2. */ if (SIZE_ONLY) { if (UTF) { - const STRLEN unilen = UNISKIP(ender); + const STRLEN unilen = UVCHR_SKIP(ender); s += unilen; /* We have to subtract 1 just below (and again in @@ -12486,11 +12746,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } else /* A regular FOLD code point */ if (! ( UTF +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU - && ender == LATIN_SMALL_LETTER_SHARP_S))) - { + && ender == LATIN_SMALL_LETTER_SHARP_S) +#endif + )) { /* Here, are folding and are not UTF-8 encoded; therefore * the character must be in the range 0-255, and is not /l * (Not /l because we already handled these under /l in @@ -12501,22 +12765,29 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* See if the character's fold differs between /d and * /u. This includes the multi-char fold SHARP S to * 'ss' */ - if (maybe_exactfu - && (PL_fold[ender] != PL_fold_latin1[ender] - || ender == LATIN_SMALL_LETTER_SHARP_S - || (len > 0 - && isALPHA_FOLD_EQ(ender, 's') - && isALPHA_FOLD_EQ(*(s-1), 's')))) - { + if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) { + RExC_seen_unfolded_sharp_s = 1; maybe_exactfu = FALSE; } - } + else if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) + || ( len > 0 + && isALPHA_FOLD_EQ(ender, 's') + && isALPHA_FOLD_EQ(*(s-1), 's')) +#endif + )) { + maybe_exactfu = FALSE; + } + } /* Even when folding, we store just the input character, as * we have an array that finds its fold quickly */ *(s++) = (char) ender; } - else { /* FOLD and UTF */ + else { /* FOLD, and UTF (or sharp s) */ /* Unlike the non-fold case, we do actually have to * calculate the results here in pass 1. This is for two * reasons, the folded length may be longer than the @@ -12646,7 +12917,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE( *s, *(s+1)))) { break; @@ -12779,7 +13050,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_parse = p - 1; Set_Node_Cur_Length(ret, parse_start); - nextchar(pRExC_state); + RExC_parse = p; + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); { /* len is STRLEN which is unsigned, need to copy to signed */ IV iv = len; @@ -12794,29 +13067,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) return(ret); } -STATIC char * -S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) -{ - /* Returns the next non-pattern-white space, non-comment character (the - * latter only if 'recognize_comment is true) in the string p, which is - * ended by RExC_end. See also reg_skipcomment */ - const char *e = RExC_end; - - PERL_ARGS_ASSERT_REGPATWS; - - while (p < e) { - STRLEN len; - if ((len = is_PATWS_safe(p, e, UTF))) { - p += len; - } - else if (recognize_comment && *p == '#') { - p = reg_skipcomment(pRExC_state, p); - } - else - break; - } - return p; -} STATIC void S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) @@ -12958,7 +13208,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) break; case 'e': if (memEQ(posixcc, "spac", 4)) /* space */ - namedclass = ANYOF_PSXSPC; + namedclass = ANYOF_SPACE; break; case 'h': if (memEQ(posixcc, "grap", 4)) /* graph */ @@ -13092,6 +13342,34 @@ S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state) && first_char == *(p - 1)); } +STATIC unsigned int +S_regex_set_precedence(const U8 my_operator) { + + /* Returns the precedence in the (?[...]) construct of the input operator, + * specified by its character representation. The precedence follows + * general Perl rules, but it extends this so that ')' and ']' have (low) + * precedence even though they aren't really operators */ + + switch (my_operator) { + case '!': + return 5; + case '&': + return 4; + case '^': + case '|': + case '+': + case '-': + return 3; + case ')': + return 2; + case ']': + return 1; + } + + NOT_REACHED; /* NOTREACHED */ + return 0; /* Silence compiler warning */ +} + STATIC regnode * S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth, @@ -13099,24 +13377,38 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, { /* Handle the (?[...]) construct to do set operations */ - U8 curchar; - UV start, end; /* End points of code point ranges */ - SV* result_string; - char *save_end, *save_parse; - SV* final; - STRLEN len; - regnode* node; - AV* stack; - const bool save_fold = FOLD; + U8 curchar; /* Current character being parsed */ + UV start, end; /* End points of code point ranges */ + SV* final = NULL; /* The end result inversion list */ + SV* result_string; /* 'final' stringified */ + AV* stack; /* stack of operators and operands not yet + resolved */ + AV* fence_stack = NULL; /* A stack containing the positions in + 'stack' of where the undealt-with left + parens would be if they were actually + put there */ + IV fence = 0; /* Position of where most recent undealt- + with left paren in stack is; -1 if none. + */ + STRLEN len; /* Temporary */ + regnode* node; /* Temporary, and final regnode returned by + this function */ + const bool save_fold = FOLD; /* Temporary */ + char *save_end, *save_parse; /* Temporaries */ + const bool in_locale = LOC; /* we turn off /l during processing */ GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; - if (LOC) { - vFAIL("(?[...]) not valid in locale"); + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); } - RExC_uni_semantics = 1; + + REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u. + This is required so that the compile + time values are valid in all runtime + cases */ /* This will return only an ANYOF regnode, or (unlikely) something smaller * (such as EXACT). Thus we can skip most everything if just sizing. We @@ -13125,22 +13417,15 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * upon an unescaped ']' that isn't one ending a regclass. To do both * these things, we need to realize that something preceded by a backslash * is escaped, so we have to keep track of backslashes */ - if (PASS2) { - Perl_ck_warner_d(aTHX_ - packWARN(WARN_EXPERIMENTAL__REGEX_SETS), - "The regex_sets feature is experimental" REPORT_LOCATION, - UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), - UTF8fARG(UTF, - RExC_end - RExC_start - (RExC_parse - RExC_precomp), - RExC_precomp + (RExC_parse - RExC_precomp))); - } - else { + if (SIZE_ONLY) { UV depth = 0; /* how many nested (?[...]) constructs */ while (RExC_parse < RExC_end) { SV* current = NULL; - RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + TRUE /* Force /x */ ); + switch (*RExC_parse) { case '?': if (RExC_parse[1] == '[') depth++, RExC_parse++; @@ -13155,6 +13440,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * default: case next time and keep on incrementing until * we find one of the invariants we do handle. */ RExC_parse++; + if (*RExC_parse == 'c') { + /* Skip the \cX notation for control characters */ + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + } break; case '[': { @@ -13169,8 +13458,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, RExC_parse++; } - /* regclass() can only return RESTART_UTF8 if multi-char - folds are allowed. */ + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 + * if multi-char folds are allowed. */ if (!regclass(pRExC_state, flagp,depth+1, is_posix_class, /* parse the whole char class only if not a @@ -13178,10 +13467,11 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, FALSE, /* don't allow multi-char folds */ TRUE, /* silence non-portable warnings. */ TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ ¤t )) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to handle_sets, " + "flags=%#"UVxf"", (UV) *flagp); /* function call leaves parse pointing to the ']', except * if we faked it */ @@ -13204,80 +13494,140 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, nextchar(pRExC_state); Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); + } + return node; } goto no_close; } - RExC_parse++; + + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; } no_close: FAIL("Syntax error in (?[...])"); } - /* Pass 2 only after this. Everything in this construct is a - * metacharacter. Operands begin with either a '\' (for an escape - * sequence), or a '[' for a bracketed character class. Any other - * character should be an operator, or parenthesis for grouping. Both - * types of operands are handled by calling regclass() to parse them. It - * is called with a parameter to indicate to return the computed inversion - * list. The parsing here is implemented via a stack. Each entry on the - * stack is a single character representing one of the operators, or the - * '('; or else a pointer to an operand inversion list. */ - -#define IS_OPERAND(a) (! SvIOK(a)) - - /* The stack starts empty. It is a syntax error if the first thing parsed - * is a binary operator; everything else is pushed on the stack. When an - * operand is parsed, the top of the stack is examined. If it is a binary - * operator, the item before it should be an operand, and both are replaced - * by the result of doing that operation on the new operand and the one on - * the stack. Thus a sequence of binary operands is reduced to a single - * one before the next one is parsed. + /* Pass 2 only after this. */ + Perl_ck_warner_d(aTHX_ + packWARN(WARN_EXPERIMENTAL__REGEX_SETS), + "The regex_sets feature is experimental" REPORT_LOCATION, + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); + + /* Everything in this construct is a metacharacter. Operands begin with + * either a '\' (for an escape sequence), or a '[' for a bracketed + * character class. Any other character should be an operator, or + * parenthesis for grouping. Both types of operands are handled by calling + * regclass() to parse them. It is called with a parameter to indicate to + * return the computed inversion list. The parsing here is implemented via + * a stack. Each entry on the stack is a single character representing one + * of the operators; or else a pointer to an operand inversion list. */ + +#define IS_OPERATOR(a) SvIOK(a) +#define IS_OPERAND(a) (! IS_OPERATOR(a)) + + /* The stack is kept in Łukasiewicz order. (That's pronounced similar + * to luke-a-shave-itch (or -itz), but people who didn't want to bother + * with pronouncing it called it Reverse Polish instead, but now that YOU + * know how to pronounce it you can use the correct term, thus giving due + * credit to the person who invented it, and impressing your geek friends. + * Wikipedia says that the pronounciation of "Ł" has been changing so that + * it is now more like an English initial W (as in wonk) than an L.) + * + * This means that, for example, 'a | b & c' is stored on the stack as + * + * c [4] + * b [3] + * & [2] + * a [1] + * | [0] + * + * where the numbers in brackets give the stack [array] element number. + * In this implementation, parentheses are not stored on the stack. + * Instead a '(' creates a "fence" so that the part of the stack below the + * fence is invisible except to the corresponding ')' (this allows us to + * replace testing for parens, by using instead subtraction of the fence + * position). As new operands are processed they are pushed onto the stack + * (except as noted in the next paragraph). New operators of higher + * precedence than the current final one are inserted on the stack before + * the lhs operand (so that when the rhs is pushed next, everything will be + * in the correct positions shown above. When an operator of equal or + * lower precedence is encountered in parsing, all the stacked operations + * of equal or higher precedence are evaluated, leaving the result as the + * top entry on the stack. This makes higher precedence operations + * evaluate before lower precedence ones, and causes operations of equal + * precedence to left associate. * - * A unary operator may immediately follow a binary in the input, for - * example + * The only unary operator '!' is immediately pushed onto the stack when + * encountered. When an operand is encountered, if the top of the stack is + * a '!", the complement is immediately performed, and the '!' popped. The + * resulting value is treated as a new operand, and the logic in the + * previous paragraph is executed. Thus in the expression * [a] + ! [b] - * When an operand is parsed and the top of the stack is a unary operator, - * the operation is performed, and then the stack is rechecked to see if - * this new operand is part of a binary operation; if so, it is handled as - * above. + * the stack looks like * - * A '(' is simply pushed on the stack; it is valid only if the stack is - * empty, or the top element of the stack is an operator or another '(' - * (for which the parenthesized expression will become an operand). By the - * time the corresponding ')' is parsed everything in between should have - * been parsed and evaluated to a single operand (or else is a syntax - * error), and is handled as a regular operand */ + * ! + * a + * + + * + * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack + * becomes + * + * !b + * a + * + + * + * A ')' is treated as an operator with lower precedence than all the + * aforementioned ones, which causes all operations on the stack above the + * corresponding '(' to be evaluated down to a single resultant operand. + * Then the fence for the '(' is removed, and the operand goes through the + * algorithm above, without the fence. + * + * A separate stack is kept of the fence positions, so that the position of + * the latest so-far unbalanced '(' is at the top of it. + * + * The ']' ending the construct is treated as the lowest operator of all, + * so that everything gets evaluated down to a single operand, which is the + * result */ sv_2mortal((SV *)(stack = newAV())); + sv_2mortal((SV *)(fence_stack = newAV())); while (RExC_parse < RExC_end) { - I32 top_index = av_tindex(stack); - SV** top_ptr; - SV* current = NULL; + I32 top_index; /* Index of top-most element in 'stack' */ + SV** top_ptr; /* Pointer to top 'stack' element */ + SV* current = NULL; /* To contain the current inversion list + operand */ + SV* only_to_avoid_leaks; - /* Skip white space */ - RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE /* means recognize comments */ ); + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + TRUE /* Force /x */ ); if (RExC_parse >= RExC_end) { Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); } - if ((curchar = UCHARAT(RExC_parse)) == ']') { - break; - } + + curchar = UCHARAT(RExC_parse); + +redo_curchar: + + top_index = av_tindex(stack); switch (curchar) { + SV** stacked_ptr; /* Ptr to something already on 'stack' */ + char stacked_operator; /* The topmost operator on the 'stack'. */ + SV* lhs; /* Operand to the left of the operator */ + SV* rhs; /* Operand to the right of the operator */ + SV* fence_ptr; /* Pointer to top element of the fence + stack */ - case '?': - if (av_tindex(stack) >= 0 /* This makes sure that we can - safely subtract 1 from - RExC_parse in the next clause. - If we have something on the - stack, we have parsed something - */ - && UCHARAT(RExC_parse - 1) == '(' - && RExC_parse < RExC_end) + case '(': + + if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?')) { /* If is a '(?', could be an embedded '(?flags:(?[...])'. * This happens when we have some thing like @@ -13292,14 +13642,18 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * interpolated expression evaluates to. We use the flags * from the interpolated pattern. */ U32 save_flags = RExC_flags; - const char * const save_parse = ++RExC_parse; + const char * save_parse; + + RExC_parse += 2; /* Skip past the '(?' */ + save_parse = RExC_parse; + /* Parse any flags for the '(?' */ parse_lparen_question_flags(pRExC_state); if (RExC_parse == save_parse /* Makes sure there was at - least one flag (or this - embedding wasn't compiled) - */ + least one flag (or else + this embedding wasn't + compiled) */ || RExC_parse >= RExC_end - 4 || UCHARAT(RExC_parse) != ':' || UCHARAT(++RExC_parse) != '(' @@ -13319,38 +13673,67 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } vFAIL("Expecting '(?flags:(?[...'"); } + + /* Recurse, with the meat of the embedded expression */ RExC_parse++; (void) handle_regex_sets(pRExC_state, ¤t, flagp, depth+1, oregcomp_parse); /* Here, 'current' contains the embedded expression's * inversion list, and RExC_parse points to the trailing - * ']'; the next character should be the ')' which will be - * paired with the '(' that has been put on the stack, so - * the whole embedded expression reduces to '(operand)' */ + * ']'; the next character should be the ')' */ + RExC_parse++; + assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')'); + + /* Then the ')' matching the original '(' handled by this + * case: statement */ RExC_parse++; + assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')'); + RExC_parse++; RExC_flags = save_flags; goto handle_operand; } - /* FALLTHROUGH */ - default: - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; - vFAIL("Unexpected character"); + /* A regular '('. Look behind for illegal syntax */ + if (top_index - fence >= 0) { + /* If the top entry on the stack is an operator, it had + * better be a '!', otherwise the entry below the top + * operand should be an operator */ + if ( ! (top_ptr = av_fetch(stack, top_index, FALSE)) + || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!') + || ( IS_OPERAND(*top_ptr) + && ( top_index - fence < 1 + || ! (stacked_ptr = av_fetch(stack, + top_index - 1, + FALSE)) + || ! IS_OPERATOR(*stacked_ptr)))) + { + RExC_parse++; + vFAIL("Unexpected '(' with no preceding operator"); + } + } + + /* Stack the position of this undealt-with left paren */ + fence = top_index + 1; + av_push(fence_stack, newSViv(fence)); + break; case '\\': - /* regclass() can only return RESTART_UTF8 if multi-char - folds are allowed. */ + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if + * multi-char folds are allowed. */ if (!regclass(pRExC_state, flagp,depth+1, TRUE, /* means parse just the next thing */ FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ TRUE, /* strict */ - ¤t - )) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + FALSE, /* Require return to be an ANYOF */ + ¤t)) + { + FAIL2("panic: regclass returned NULL to handle_sets, " + "flags=%#"UVxf"", (UV) *flagp); + } + /* regclass() will return with parsing just the \ sequence, * leaving the parse pointer at the next thing to parse */ RExC_parse--; @@ -13364,18 +13747,22 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, RExC_parse++; } - /* regclass() can only return RESTART_UTF8 if multi-char - folds are allowed. */ + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if + * multi-char folds are allowed. */ if(!regclass(pRExC_state, flagp,depth+1, is_posix_class, /* parse the whole char class only if not a posix class */ FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ ¤t )) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + { + FAIL2("panic: regclass returned NULL to handle_sets, " + "flags=%#"UVxf"", (UV) *flagp); + } + /* function call leaves parse pointing to the ']', except if we * faked it */ if (is_posix_class) { @@ -13385,154 +13772,268 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, goto handle_operand; } + case ']': + if (top_index >= 1) { + goto join_operators; + } + + /* Only a single operand on the stack: are done */ + goto done; + + case ')': + if (av_tindex(fence_stack) < 0) { + RExC_parse++; + vFAIL("Unexpected ')'"); + } + + /* If at least two thing on the stack, treat this as an + * operator */ + if (top_index - fence >= 1) { + goto join_operators; + } + + /* Here only a single thing on the fenced stack, and there is a + * fence. Get rid of it */ + fence_ptr = av_pop(fence_stack); + assert(fence_ptr); + fence = SvIV(fence_ptr) - 1; + SvREFCNT_dec_NN(fence_ptr); + fence_ptr = NULL; + + if (fence < 0) { + fence = 0; + } + + /* Having gotten rid of the fence, we pop the operand at the + * stack top and process it as a newly encountered operand */ + current = av_pop(stack); + if (IS_OPERAND(current)) { + goto handle_operand; + } + + RExC_parse++; + goto bad_syntax; + case '&': case '|': case '+': case '-': case '^': - if (top_index < 0 + + /* These binary operators should have a left operand already + * parsed */ + if ( top_index - fence < 0 + || top_index - fence == 1 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) || ! IS_OPERAND(*top_ptr)) { - RExC_parse++; - vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar); + goto unexpected_binary; } - av_push(stack, newSVuv(curchar)); - break; - case '!': - av_push(stack, newSVuv(curchar)); - break; + /* If only the one operand is on the part of the stack visible + * to us, we just place this operator in the proper position */ + if (top_index - fence < 2) { - case '(': - if (top_index >= 0) { - top_ptr = av_fetch(stack, top_index, FALSE); - assert(top_ptr); - if (IS_OPERAND(*top_ptr)) { - RExC_parse++; - vFAIL("Unexpected '(' with no preceding operator"); - } + /* Place the operator before the operand */ + + SV* lhs = av_pop(stack); + av_push(stack, newSVuv(curchar)); + av_push(stack, lhs); + break; } - av_push(stack, newSVuv(curchar)); - break; - case ')': - { - SV* lparen; - if (top_index < 1 - || ! (current = av_pop(stack)) - || ! IS_OPERAND(current) - || ! (lparen = av_pop(stack)) - || IS_OPERAND(lparen) - || SvUV(lparen) != '(') + /* But if there is something else on the stack, we need to + * process it before this new operator if and only if the + * stacked operation has equal or higher precedence than the + * new one */ + + join_operators: + + /* The operator on the stack is supposed to be below both its + * operands */ + if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE)) + || IS_OPERAND(*stacked_ptr)) { - SvREFCNT_dec(current); + /* But if not, it's legal and indicates we are completely + * done if and only if we're currently processing a ']', + * which should be the final thing in the expression */ + if (curchar == ']') { + goto done; + } + + unexpected_binary: RExC_parse++; - vFAIL("Unexpected ')'"); + vFAIL2("Unexpected binary operator '%c' with no " + "preceding operand", curchar); } - top_index -= 2; - SvREFCNT_dec_NN(lparen); + stacked_operator = (char) SvUV(*stacked_ptr); - /* FALLTHROUGH */ - } + if (regex_set_precedence(curchar) + > regex_set_precedence(stacked_operator)) + { + /* Here, the new operator has higher precedence than the + * stacked one. This means we need to add the new one to + * the stack to await its rhs operand (and maybe more + * stuff). We put it before the lhs operand, leaving + * untouched the stacked operator and everything below it + * */ + lhs = av_pop(stack); + assert(IS_OPERAND(lhs)); - handle_operand: + av_push(stack, newSVuv(curchar)); + av_push(stack, lhs); + break; + } - /* Here, we have an operand to process, in 'current' */ + /* Here, the new operator has equal or lower precedence than + * what's already there. This means the operation already + * there should be performed now, before the new one. */ - if (top_index < 0) { /* Just push if stack is empty */ - av_push(stack, current); + rhs = av_pop(stack); + if (! IS_OPERAND(rhs)) { + + /* This can happen when a ! is not followed by an operand, + * like in /(?[\t &!])/ */ + goto bad_syntax; } - else { - SV* top = av_pop(stack); - SV *prev = NULL; - char current_operator; - - if (IS_OPERAND(top)) { - SvREFCNT_dec_NN(top); - SvREFCNT_dec_NN(current); - vFAIL("Operand with no preceding operator"); + + lhs = av_pop(stack); + assert(IS_OPERAND(lhs)); + + switch (stacked_operator) { + case '&': + _invlist_intersection(lhs, rhs, &rhs); + break; + + case '|': + case '+': + _invlist_union(lhs, rhs, &rhs); + break; + + case '-': + _invlist_subtract(lhs, rhs, &rhs); + break; + + case '^': /* The union minus the intersection */ + { + SV* i = NULL; + SV* u = NULL; + SV* element; + + _invlist_union(lhs, rhs, &u); + _invlist_intersection(lhs, rhs, &i); + /* _invlist_subtract will overwrite rhs + without freeing what it already contains */ + element = rhs; + _invlist_subtract(u, i, &rhs); + SvREFCNT_dec_NN(i); + SvREFCNT_dec_NN(u); + SvREFCNT_dec_NN(element); + break; } - current_operator = (char) SvUV(top); - switch (current_operator) { - case '(': /* Push the '(' back on followed by the new - operand */ - av_push(stack, top); - av_push(stack, current); - SvREFCNT_inc(top); /* Counters the '_dec' done - just after the 'break', so - it doesn't get wrongly freed - */ - break; + } + SvREFCNT_dec(lhs); + + /* Here, the higher precedence operation has been done, and the + * result is in 'rhs'. We overwrite the stacked operator with + * the result. Then we redo this code to either push the new + * operator onto the stack or perform any higher precedence + * stacked operation */ + only_to_avoid_leaks = av_pop(stack); + SvREFCNT_dec(only_to_avoid_leaks); + av_push(stack, rhs); + goto redo_curchar; + + case '!': /* Highest priority, right associative */ + + /* If what's already at the top of the stack is another '!", + * they just cancel each other out */ + if ( (top_ptr = av_fetch(stack, top_index, FALSE)) + && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!')) + { + only_to_avoid_leaks = av_pop(stack); + SvREFCNT_dec(only_to_avoid_leaks); + } + else { /* Otherwise, since it's right associative, just push + onto the stack */ + av_push(stack, newSVuv(curchar)); + } + break; - case '!': - _invlist_invert(current); - - /* Unlike binary operators, the top of the stack, - * now that this unary one has been popped off, may - * legally be an operator, and we now have operand - * for it. */ - top_index--; - SvREFCNT_dec_NN(top); - goto handle_operand; - - case '&': - prev = av_pop(stack); - _invlist_intersection(prev, - current, - ¤t); - av_push(stack, current); - break; + default: + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unexpected character"); - case '|': - case '+': - prev = av_pop(stack); - _invlist_union(prev, current, ¤t); - av_push(stack, current); - break; + handle_operand: + + /* Here 'current' is the operand. If something is already on the + * stack, we have to check if it is a !. */ + top_index = av_tindex(stack); /* Code above may have altered the + * stack in the time since we + * earlier set 'top_index'. */ + if (top_index - fence >= 0) { + /* If the top entry on the stack is an operator, it had better + * be a '!', otherwise the entry below the top operand should + * be an operator */ + top_ptr = av_fetch(stack, top_index, FALSE); + assert(top_ptr); + if (IS_OPERATOR(*top_ptr)) { + + /* The only permissible operator at the top of the stack is + * '!', which is applied immediately to this operand. */ + curchar = (char) SvUV(*top_ptr); + if (curchar != '!') { + SvREFCNT_dec(current); + vFAIL2("Unexpected binary operator '%c' with no " + "preceding operand", curchar); + } - case '-': - prev = av_pop(stack);; - _invlist_subtract(prev, current, ¤t); - av_push(stack, current); - break; + _invlist_invert(current); - case '^': /* The union minus the intersection */ - { - SV* i = NULL; - SV* u = NULL; - SV* element; - - prev = av_pop(stack); - _invlist_union(prev, current, &u); - _invlist_intersection(prev, current, &i); - /* _invlist_subtract will overwrite current - without freeing what it already contains */ - element = current; - _invlist_subtract(u, i, ¤t); - av_push(stack, current); - SvREFCNT_dec_NN(i); - SvREFCNT_dec_NN(u); - SvREFCNT_dec_NN(element); - break; - } + only_to_avoid_leaks = av_pop(stack); + SvREFCNT_dec(only_to_avoid_leaks); + top_index = av_tindex(stack); - default: - Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); + /* And we redo with the inverted operand. This allows + * handling multiple ! in a row */ + goto handle_operand; + } + /* Single operand is ok only for the non-binary ')' + * operator */ + else if ((top_index - fence == 0 && curchar != ')') + || (top_index - fence > 0 + && (! (stacked_ptr = av_fetch(stack, + top_index - 1, + FALSE)) + || IS_OPERAND(*stacked_ptr)))) + { + SvREFCNT_dec(current); + vFAIL("Operand with no preceding operator"); } - SvREFCNT_dec_NN(top); - SvREFCNT_dec(prev); } - } + + /* Here there was nothing on the stack or the top element was + * another operand. Just add this new one */ + av_push(stack, current); + + } /* End of switch on next parse token */ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + } /* End of loop parsing through the construct */ + + done: + if (av_tindex(fence_stack) >= 0) { + vFAIL("Unmatched ("); } if (av_tindex(stack) < 0 /* Was empty */ || ((final = av_pop(stack)) == NULL) || ! IS_OPERAND(final) + || SvTYPE(final) != SVt_INVLIST || av_tindex(stack) >= 0) /* More left on stack */ { + bad_syntax: + SvREFCNT_dec(final); vFAIL("Incomplete expression within '(?[ ])'"); } @@ -13557,6 +14058,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } } + /* About to generate an ANYOF (or similar) node from the inversion list we + * have calculated */ save_parse = RExC_parse; RExC_parse = SvPV(result_string, len); save_end = RExC_end; @@ -13566,8 +14069,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * already has all folding taken into consideration, and we don't want * regclass() to add to that */ RExC_flags &= ~RXf_PMf_FOLD; - /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. - */ + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char + * folds are allowed. */ node = regclass(pRExC_state, flagp,depth+1, FALSE, /* means parse the whole char class */ FALSE, /* don't allow multi-char folds */ @@ -13575,14 +14078,42 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, well have generated non-portable code points, but they're valid on this machine */ FALSE, /* similarly, no need for strict */ + FALSE, /* Require return to be an ANYOF */ NULL ); if (!node) FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, PTR2UV(flagp)); + + /* Fix up the node type if we are in locale. (We have pretended we are + * under /u for the purposes of regclass(), as this construct will only + * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so + * as to cause any warnings about bad locales to be output in regexec.c), + * and add the flag that indicates to check if not in a UTF-8 locale. The + * reason we above forbid optimization into something other than an ANYOF + * node is simply to minimize the number of code changes in regexec.c. + * Otherwise we would have to create new EXACTish node types and deal with + * them. This decision could be revisited should this construct become + * popular. + * + * (One might think we could look at the resulting ANYOF node and suppress + * the flag if everything is above 255, as those would be UTF-8 only, + * but this isn't true, as the components that led to that result could + * have been locale-affected, and just happen to cancel each other out + * under UTF-8 locales.) */ + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); + + assert(OP(node) == ANYOF); + + OP(node) = ANYOFL; + ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8; + } + if (save_fold) { RExC_flags |= RXf_PMf_FOLD; } + RExC_parse = save_parse + 1; RExC_end = save_end; SvREFCNT_dec_NN(final); @@ -13592,6 +14123,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ return node; } +#undef IS_OPERATOR #undef IS_OPERAND STATIC void @@ -13638,9 +14170,30 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); break; + +#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */ + case LATIN_SMALL_LETTER_SHARP_S: *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); break; + +#endif + +#if UNICODE_MAJOR_VERSION < 3 \ + || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0) + + /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did + * U+0131. */ + case 'i': + case 'I': + *invlist = + add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); +# if UNICODE_DOT_DOT_VERSION == 1 + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I); +# endif + break; +#endif + default: /* Use deprecated warning to increase the chances of this being * output */ @@ -13704,6 +14257,23 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN c #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ (SvCUR(listsv) != initial_listsv_len) +/* There is a restricted set of white space characters that are legal when + * ignoring white space in a bracketed character class. This generates the + * code to skip them. + * + * There is a line below that uses the same white space criteria but is outside + * this macro. Both here and there must use the same definition */ +#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \ + STMT_START { \ + if (do_skip) { \ + while ( p < RExC_end \ + && isBLANK_A(UCHARAT(p))) \ + { \ + p++; \ + } \ + } \ + } STMT_END + STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1, /* Just parse the next thing, don't @@ -13713,6 +14283,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, about too large characters */ const bool strict, + bool optimizable, /* ? Allow a non-ANYOF return + node */ SV** ret_invlist /* Return an inversion list, not a node */ ) { @@ -13737,8 +14309,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * are extra bits for \w, etc. in locale ANYOFs, as what these match is not * determinable at compile time * - * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs - * to be restarted. This can only happen if ret_invlist is non-NULL. + * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs + * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded + * to UTF-8. This can only happen if ret_invlist is non-NULL. */ UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; @@ -13833,11 +14406,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, DEBUG_PARSE("clas"); +#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \ + && UNICODE_DOT_DOT_VERSION == 0) + allow_multi_folds = FALSE; +#endif + /* Assume we are going to generate an ANYOF node. */ ret = reganode(pRExC_state, (LOC) ? ANYOFL - : ANYOF, + : (DEPENDS_SEMANTICS) + ? ANYOFD + : ANYOF, 0); if (SIZE_ONLY) { @@ -13853,20 +14434,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ } - if (skip_white) { - RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */ ); - } + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ RExC_parse++; invert = TRUE; allow_multi_folds = FALSE; MARK_NAUGHTY(1); - if (skip_white) { - RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */ ); - } + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); } /* Check that they didn't say [:posix:] instead of [[:posix:]] */ @@ -13903,10 +14478,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, break; } - if (skip_white) { - RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */ ); - } + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); if (UCHARAT(RExC_parse) == ']') { break; @@ -13959,7 +14531,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * skipped, it means that that white space is wanted literally, and * is already in 'value'. Otherwise, need to translate the escape * into what it signifies. */ - if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) { + if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) { case 'w': namedclass = ANYOF_WORDCHAR; break; case 'W': namedclass = ANYOF_NWORDCHAR; break; @@ -13973,14 +14545,26 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { - SV *as_text; - STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value, - flagp, depth, &as_text); - if (*flagp & RESTART_UTF8) - FAIL("panic: grok_bslash_N set RESTART_UTF8"); - if (cp_count != 1) { /* The typical case drops through */ - assert(cp_count != (STRLEN) -1); - if (cp_count == 0) { + const char * const backslash_N_beg = RExC_parse - 2; + int cp_count; + + if (! grok_bslash_N(pRExC_state, + NULL, /* No regnode */ + &value, /* Yes single value */ + &cp_count, /* Multiple code pt count */ + flagp, + depth) + ) { + + if (*flagp & NEED_UTF8) + FAIL("panic: grok_bslash_N set NEED_UTF8"); + if (*flagp & RESTART_PASS1) + return NULL; + + if (cp_count < 0) { + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + else if (cp_count == 0) { if (strict) { RExC_parse++; /* Position after the "}" */ vFAIL("Zero length \\N{}"); @@ -14000,16 +14584,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, else if (PASS2) { ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class"); } + break; /* contains the first code + point. Drop out of the switch to + process it */ } else { + SV * multi_char_N = newSVpvn(backslash_N_beg, + RExC_parse - backslash_N_beg); multi_char_matches = add_multi_match(multi_char_matches, - as_text, + multi_char_N, cp_count); } - break; /* contains the first code - point. Drop out of the switch to - process it */ } } /* End of cp_count != 1 */ @@ -14042,38 +14628,53 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, vFAIL2("Empty \\%c{}", (U8)value); if (*RExC_parse == '{') { const U8 c = (U8)value; - e = strchr(RExC_parse++, '}'); - if (!e) + e = strchr(RExC_parse, '}'); + if (!e) { + RExC_parse++; vFAIL2("Missing right brace on \\%c{}", c); - while (isSPACE(*RExC_parse)) - RExC_parse++; + } + + RExC_parse++; + while (isSPACE(*RExC_parse)) { + RExC_parse++; + } + + if (UCHARAT(RExC_parse) == '^') { + + /* toggle. (The rhs xor gets the single bit that + * differs between P and p; the other xor inverts just + * that bit) */ + value ^= 'P' ^ 'p'; + + RExC_parse++; + while (isSPACE(*RExC_parse)) { + RExC_parse++; + } + } + if (e == RExC_parse) vFAIL2("Empty \\%c{}", c); + n = e - RExC_parse; while (isSPACE(*(RExC_parse + n - 1))) n--; - } - else { + } /* The \p isn't immediately followed by a '{' */ + else if (! isALPHA(*RExC_parse)) { + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL2("Character following \\%c must be '{' or a " + "single-character Unicode property name", + (U8) value); + } + else { e = RExC_parse; n = 1; } if (!SIZE_ONLY) { SV* invlist; char* name; + char* base_name; /* name after any packages are stripped */ + const char * const colon_colon = "::"; - if (UCHARAT(RExC_parse) == '^') { - RExC_parse++; - n--; - /* toggle. (The rhs xor gets the single bit that - * differs between P and p; the other xor inverts just - * that bit) */ - value ^= 'P' ^ 'p'; - - while (isSPACE(*RExC_parse)) { - RExC_parse++; - n--; - } - } /* Try to get the definition of the property into * . If /i is in effect, the effective property * will have its name be <__NAME_i>. The design is @@ -14089,7 +14690,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Look up the property name, and get its swash and * inversion list, if the property is found */ - if (swash) { + if (swash) { /* Return any left-overs */ SvREFCNT_dec_NN(swash); } swash = _core_swash_init("utf8", name, &PL_sv_undef, @@ -14102,26 +14703,57 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, HV* curpkg = (IN_PERL_COMPILETIME) ? PL_curstash : CopSTASH(PL_curcop); - if (swash) { + UV final_n = n; + bool has_pkg; + + if (swash) { /* Got a swash but no inversion list. + Something is likely wrong that will + be sorted-out later */ SvREFCNT_dec_NN(swash); swash = NULL; } - /* Here didn't find it. It could be a user-defined - * property that will be available at run-time. If we - * accept only compile-time properties, is an error; - * otherwise add it to the list for run-time look up */ - if (ret_invlist) { + /* Here didn't find it. It could be a an error (like a + * typo) in specifying a Unicode property, or it could + * be a user-defined property that will be available at + * run-time. The names of these must begin with 'In' + * or 'Is' (after any packages are stripped off). So + * if not one of those, or if we accept only + * compile-time properties, is an error; otherwise add + * it to the list for run-time look up. */ + if ((base_name = rninstr(name, name + n, + colon_colon, colon_colon + 2))) + { /* Has ::. We know this must be a user-defined + property */ + base_name += 2; + final_n -= base_name - name; + has_pkg = TRUE; + } + else { + base_name = name; + has_pkg = FALSE; + } + + if ( final_n < 3 + || base_name[0] != 'I' + || (base_name[1] != 's' && base_name[1] != 'n') + || ret_invlist) + { + const char * const msg + = (has_pkg) + ? "Illegal user-defined property name" + : "Can't find Unicode property definition"; RExC_parse = e + 1; - vFAIL2utf8f( - "Property '%"UTF8f"' is unknown", - UTF8fARG(UTF, n, name)); + + /* diag_listed_as: Can't find Unicode property definition "%s" */ + vFAIL3utf8f("%s \"%"UTF8f"\"", + msg, UTF8fARG(UTF, n, name)); } /* If the property name doesn't already have a package * name, add the current one to it so that it can be * referred to outside it. [perl #121777] */ - if (curpkg && ! instr(name, "::")) { + if (! has_pkg && curpkg) { char* pkgname = HvNAME(curpkg); if (strNE(pkgname, "main")) { char* full_name = Perl_form(aTHX_ @@ -14137,11 +14769,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, (value == 'p' ? '+' : '!'), UTF8fARG(UTF, n, name)); has_user_defined_property = TRUE; + optimizable = FALSE; /* Will have to leave this an + ANYOF node */ /* We don't know yet, so have to assume that the - * property could match something in the Latin1 range, - * hence something that isn't utf8. Note that this - * would cause things in to match + * property could match something in the upper Latin1 + * range, hence something that isn't utf8. Note that + * this would cause things in to match * inappropriately, except that any \p{}, including * this one forces Unicode semantics, which means there * is no */ @@ -14194,7 +14828,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, named */ /* \p means they want Unicode semantics */ - RExC_uni_semantics = 1; + REQUIRE_UNI_RULES(flagp, NULL); } break; case 'n': value = '\n'; break; @@ -14282,7 +14916,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, recode_encoding: if (! RExC_override_recoding) { SV* enc = _get_encoding(); - value = reg_recode((const char)(U8)value, &enc); + value = reg_recode((U8)value, &enc); if (!enc) { if (strict) { vFAIL("Invalid escape in the specified encoding"); @@ -14371,6 +15005,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL; ANYOF_POSIXL_ZERO(ret); + + /* We can't change this into some other type of node + * (unless this is the only element, in which case there + * are nodes that mean exactly this) as has runtime + * dependencies */ + optimizable = FALSE; } /* Coverity thinks it is possible for this to be negative; both @@ -14486,10 +15126,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } /* end of namedclass \blah */ - if (skip_white) { - RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */ ); - } + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); /* If 'range' is set, 'value' is the ending of a range--check its * validity. (If value isn't a single code point in the case of a @@ -14521,7 +15158,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, vFAIL2utf8f( "Invalid [] range \"%"UTF8f"\"", UTF8fARG(UTF, w, rangebegin)); - NOT_REACHED; /* NOT REACHED */ + NOT_REACHED; /* NOTREACHED */ } } else { @@ -14530,12 +15167,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, && *RExC_parse == '-') { char* next_char_ptr = RExC_parse + 1; - if (skip_white) { /* Get the next real char after the '-' */ - next_char_ptr = regpatws(pRExC_state, - RExC_parse + 1, - FALSE); /* means don't recognize - comments */ - } + + /* Get the next real char after the '-' */ + SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr); /* If the '-' is at the end of the class (just before the ']', * it is a literal minus; otherwise it is a range */ @@ -14580,7 +15214,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* non-Latin1 code point implies unicode semantics. Must be set in * pass1 so is there for the whole of pass 2 */ if (value > 255) { - RExC_uni_semantics = 1; + REQUIRE_UNI_RULES(flagp, NULL); } /* Ready to process either the single value, or the completed range. @@ -14695,9 +15329,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * same element, neither should be a digit. */ if (index_start == index_final) { assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start) - || invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1] - - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] - == 10); + || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1] + - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] + == 10) + /* But actually Unicode did have one group of 11 + * 'digits' in 5.2, so in case we are operating + * on that version, let that pass */ + || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1] + - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] + == 11 + && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] + == 0x19D0) + ); } else if ((index_start >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)) @@ -14853,7 +15496,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ret = reg(pRExC_state, 1, ®_flags, depth+1); - *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8); + *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8); RExC_parse = save_parse; RExC_end = save_end; @@ -14873,8 +15516,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * 2) if the character class contains only a single element (including a * single range), we see if there is an equivalent node for it. * Other checks are possible */ - if (! ret_invlist /* Can't optimize if returning the constructed - inversion list */ + if ( optimizable + && ! ret_invlist /* Can't optimize if returning the constructed + inversion list */ && (UNLIKELY(posixl_matches_all) || element_count == 1)) { U8 op = END; @@ -14987,10 +15631,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, op = POSIXA; } } - else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) { + else if (! FOLD || ASCII_FOLD_RESTRICTED) { /* We can optimize A-Z or a-z, but not if they could match - * something like the KELVIN SIGN under /i (/a means they - * can't) */ + * something like the KELVIN SIGN under /i. */ if (prevvalue == 'A') { if (value == 'Z' #ifdef EBCDIC @@ -15086,7 +15729,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Our calculated list will be for Unicode rules. For locale * matching, we have to keep a separate list that is consulted at * runtime only when the locale indicates Unicode rules. For - * non-locale, we just use to the general list */ + * non-locale, we just use the general list */ if (LOC) { use_list = &only_utf8_locale_list; } @@ -15236,7 +15879,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (DEPENDS_SEMANTICS) { /* Under /d, everything in the upper half of the Latin1 range * matches these complements */ - ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII; + ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; } else if (AT_LEAST_ASCII_RESTRICTED) { /* Under /a and /aa, everything above ASCII matches these @@ -15323,7 +15966,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } if (warn_super) { - ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; + ANYOF_FLAGS(ret) + |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + + /* Because an ANYOF node is the only one that warns, this node + * can't be optimized into something else */ + optimizable = FALSE; } } @@ -15345,8 +15993,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (only_utf8_locale_list) { ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; } - else if (cp_list) { /* Look to see if there a 0-255 code point is in - the list */ + else if (cp_list) { /* Look to see if a 0-255 code point is in list */ UV start, end; invlist_iterinit(cp_list); if (invlist_iternext(cp_list, &start, &end) && start < 256) { @@ -15379,6 +16026,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } if (ret_invlist) { + assert(cp_list); + *ret_invlist = cp_list; SvREFCNT_dec(swash); @@ -15403,21 +16052,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * adjacent such nodes. And if the class is equivalent to things like /./, * expensive run-time swashes can be avoided. Now that we have more * complete information, we can find things necessarily missed by the - * earlier code. I (khw) am not sure how much to look for here. It would - * be easy, but perhaps too slow, to check any candidates against all the - * node types they could possibly match using _invlistEQ(). */ - - if (cp_list - && ! invert - && ! depends_list - && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION - - /* We don't optimize if we are supposed to make sure all non-Unicode - * code points raise a warning, as only ANYOF nodes have this check. - * */ - && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) - { + * earlier code. I (khw) did some benchmarks and found essentially no + * speed difference between using a POSIXA node versus an ANYOF node, so + * there is no reason to optimize, for example [A-Za-z0-9_] into + * [[:word:]]/a (although if we did it in the sizing pass it would save + * space). _invlistEQ() could be used if one ever wanted to do something + * like this at this point in the code */ + + if (optimizable && cp_list && ! invert && ! depends_list) { UV start, end; U8 op = END; /* The optimzation node-type */ const char * cur_parse= RExC_parse; @@ -15426,9 +16068,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (! invlist_iternext(cp_list, &start, &end)) { /* Here, the list is empty. This happens, for example, when a - * Unicode property is the only thing in the character class, and - * it doesn't match anything. (perluniprops.pod notes such - * properties) */ + * Unicode property that doesn't match anything is the only element + * in the character class (perluniprops.pod notes such properties). + * */ op = OPFAIL; *flagp |= HASWIDTH|SIMPLE; } @@ -15484,7 +16126,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } } - } + } /* End of first range contains just a single code point */ else if (start == 0) { if (end == UV_MAX) { op = SANY; @@ -15506,7 +16148,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_parse = (char *)orig_parse; RExC_emit = (regnode *)orig_emit; - ret = reg_node(pRExC_state, op); + if (regarglen[op]) { + ret = reganode(pRExC_state, op, 0); + } else { + ret = reg_node(pRExC_state, op); + } RExC_parse = (char *)cur_parse; @@ -15815,50 +16461,86 @@ S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) return p; } -/* nextchar() - - Advances the parse position, and optionally absorbs - "whitespace" from the inputstream. +STATIC void +S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state, + char ** p, + const bool force_to_xmod + ) +{ + /* If the text at the current parse position '*p' is a '(?#...)' comment, + * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p' + * is /x whitespace, advance '*p' so that on exit it points to the first + * byte past all such white space and comments */ - Without /x "whitespace" means (?#...) style comments only, - with /x this means (?#...) and # comments and whitespace proper. + const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED); - Returns the RExC_parse point from BEFORE the scan occurs. + PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT; - This is the /x friendly way of saying RExC_parse++. -*/ - -STATIC char* -S_nextchar(pTHX_ RExC_state_t *pRExC_state) -{ - char* const retval = RExC_parse++; - - PERL_ARGS_ASSERT_NEXTCHAR; + assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p)); for (;;) { - if (RExC_end - RExC_parse >= 3 - && *RExC_parse == '(' - && RExC_parse[1] == '?' - && RExC_parse[2] == '#') + if (RExC_end - (*p) >= 3 + && *(*p) == '(' + && *(*p + 1) == '?' + && *(*p + 2) == '#') { - while (*RExC_parse != ')') { - if (RExC_parse == RExC_end) + while (*(*p) != ')') { + if ((*p) == RExC_end) FAIL("Sequence (?#... not terminated"); - RExC_parse++; + (*p)++; } - RExC_parse++; + (*p)++; continue; } - if (RExC_flags & RXf_PMf_EXTENDED) { - char * p = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ - if (p != RExC_parse) { - RExC_parse = p; + + if (use_xmod) { + const char * save_p = *p; + while ((*p) < RExC_end) { + STRLEN len; + if ((len = is_PATWS_safe((*p), RExC_end, UTF))) { + (*p) += len; + } + else if (*(*p) == '#') { + (*p) = reg_skipcomment(pRExC_state, (*p)); + } + else { + break; + } + } + if (*p != save_p) { continue; } } - return retval; + + break; } + + return; +} + +/* nextchar() + + Advances the parse position by one byte, unless that byte is the beginning + of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In + those two cases, the parse position is advanced beyond all such comments and + white space. + + This is the UTF, (?#...), and /x friendly way of saying RExC_parse++. +*/ + +STATIC void +S_nextchar(pTHX_ RExC_state_t *pRExC_state) +{ + PERL_ARGS_ASSERT_NEXTCHAR; + + assert( ! UTF + || UTF8_IS_INVARIANT(*RExC_parse) + || UTF8_IS_START(*RExC_parse)); + + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't assume /x */ ); } STATIC regnode * @@ -16347,7 +17029,7 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_printf(Perl_debug_log, "(SBOL)"); if (r->intflags & PREGf_ANCH_GPOS) PerlIO_printf(Perl_debug_log, "(GPOS)"); - PerlIO_putc(Perl_debug_log, ' '); + (void)PerlIO_putc(Perl_debug_log, ' '); } if (r->intflags & PREGf_GPOS_SEEN) PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); @@ -16386,8 +17068,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \ || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \ - || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \ - || _CC_VERTSPACE != 16 + || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15 #error Need to adjust order of anyofs[] #endif "\\w", @@ -16416,8 +17097,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ "[:^blank:]", "[:xdigit:]", "[:^xdigit:]", - "[:space:]", - "[:^space:]", "[:cntrl:]", "[:^cntrl:]", "[:ascii:]", @@ -16489,9 +17168,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } } else if (k == CURLY) { + U32 lo = ARG1(o), hi = ARG2(o); if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ - Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); + Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); + if (hi == REG_INFTY) + sv_catpvs(sv, "INFTY"); + else + Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi); + sv_catpvs(sv, "}"); } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); @@ -16499,7 +17184,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ || k == GROUPP || OP(o)==ACCEPT) { AV *name_list= NULL; - Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ + U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o); + Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); } else if ( pRExC_state ) { @@ -16507,12 +17193,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } if (name_list) { if ( k != REF || (OP(o) < NREF)) { - SV **name= av_fetch(name_list, ARG(o), 0 ); + SV **name= av_fetch(name_list, parno, 0 ); if (name) Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); } else { - SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); + SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); I32 *nums=(I32*)SvPVX(sv_dat); SV **name= av_fetch(name_list, nums[0], 0 ); I32 n; @@ -16555,11 +17241,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); } } - else if (k == VERB) { - if (!o->flags) - Perl_sv_catpvf(aTHX_ sv, ":%"SVf, - SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); - } else if (k == LOGICAL) + else if (k == LOGICAL) /* 2: embedded, otherwise 1 */ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { @@ -16568,8 +17250,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ SV* bitmap_invlist; /* Will hold what the bit map contains */ - if (OP(o) == ANYOFL) - sv_catpvs(sv, "{loc}"); + if (OP(o) == ANYOFL) { + if (flags & ANYOF_LOC_REQ_UTF8) { + sv_catpvs(sv, "{utf8-loc}"); + } + else { + sv_catpvs(sv, "{loc}"); + } + } if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); @@ -16605,7 +17293,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ sv_catpvs(sv, "^"); } - if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) { + if (OP(o) == ANYOFD + && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) + { sv_catpvs(sv, "{non-utf8-latin1-all}"); } @@ -16716,10 +17406,26 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); } } + else if (k == BOUND || k == NBOUND) { + /* Must be synced with order of 'bound_type' in regcomp.h */ + const char * const bounds[] = { + "", /* Traditional */ + "{gcb}", + "{sb}", + "{wb}" + }; + sv_catpv(sv, bounds[FLAGS(o)]); + } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); else if (OP(o) == SBOL) Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); + + /* add on the verb argument if there is one */ + if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) { + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); + } #else PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(sv); @@ -17294,6 +18000,47 @@ S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); } +/* XXX Here's a total kludge. But we need to re-enter for swash routines. */ + +#ifndef PERL_IN_XSUB_RE +void +Perl_save_re_context(pTHX) +{ + I32 nparens = -1; + I32 i; + + /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ + + if (PL_curpm) { + const REGEXP * const rx = PM_GETRE(PL_curpm); + if (rx) + nparens = RX_NPARENS(rx); + } + + /* RT #124109. This is a complete hack; in the SWASHNEW case we know + * that PL_curpm will be null, but that utf8.pm and the modules it + * loads will only use $1..$3. + * The t/porting/re_context.t test file checks this assumption. + */ + if (nparens == -1) + nparens = 3; + + for (i = 1; i <= nparens; i++) { + char digits[TYPE_CHARS(long)]; + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); + GV *const *const gvp + = (GV**)hv_fetch(PL_defstash, digits, len, 0); + + if (gvp) { + GV * const gv = *gvp; + if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) + save_scalar(gv); + } + } +} +#endif + #ifdef DEBUGGING STATIC void @@ -17470,9 +18217,13 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) this_end = (end < NUM_ANYOF_CODE_POINTS) ? end : NUM_ANYOF_CODE_POINTS - 1; +#if NUM_ANYOF_CODE_POINTS > 256 format = (this_end < 256) ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}" : "\\x{%04"UVXf"}-\\x{%04"UVXf"}"; +#else + format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}"; +#endif GCC_DIAG_IGNORE(-Wformat-nonliteral); Perl_sv_catpvf(aTHX_ sv, format, start, this_end); GCC_DIAG_RESTORE; @@ -17758,11 +18509,5 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, #endif /* DEBUGGING */ /* - * Local variables: - * c-indentation-style: bsd - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - * * ex: set ts=8 sts=4 sw=4 et: */