X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/84679df57ca0626f7fb35fc3038e2e142b97f8a4..2867cdbcac19071f99ab71a81d63dbd894cebd3b:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 775049d..56d7e55 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2,7 +2,9 @@ */ /* - * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee + * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee + * + * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] */ /* This file contains functions for compiling a regular expression. See @@ -57,7 +59,8 @@ **** Alterations to Henry's code are... **** **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, - **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -102,6 +105,7 @@ typedef struct RExC_state_t { U32 flags; /* are we folding, multilining? */ char *precomp; /* uncompiled string. */ + REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ regexp_internal *rxi; /* internal data for regexp object pprivate field */ char *start; /* Start of input for compile */ @@ -128,7 +132,6 @@ typedef struct RExC_state_t { I32 orig_utf8; /* whether the pattern was originally in utf8 */ /* XXX use this for future optimisation of case * where pattern must be upgraded to utf8. */ - HV *charnames; /* cache of named sequences */ HV *paren_names; /* Paren names */ regnode **recurse; /* Recurse regops */ @@ -149,6 +152,7 @@ typedef struct RExC_state_t { #define RExC_flags (pRExC_state->flags) #define RExC_precomp (pRExC_state->precomp) +#define RExC_rx_sv (pRExC_state->rx_sv) #define RExC_rx (pRExC_state->rx) #define RExC_rxi (pRExC_state->rxi) #define RExC_start (pRExC_state->start) @@ -172,7 +176,6 @@ typedef struct RExC_state_t { #define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) #define RExC_orig_utf8 (pRExC_state->orig_utf8) -#define RExC_charnames (pRExC_state->charnames) #define RExC_open_parens (pRExC_state->open_parens) #define RExC_close_parens (pRExC_state->close_parens) #define RExC_opend (pRExC_state->opend) @@ -389,7 +392,7 @@ static const scan_data_t zero_scan_data = IV len = RExC_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ if (len > RegexLengthToShowInErrorMessages) { \ /* chop 10 shorter than the max, to ensure meaning of "..." */ \ len = RegexLengthToShowInErrorMessages - 10; \ @@ -420,7 +423,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL(m) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL(m); \ } STMT_END @@ -438,7 +441,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL2(m,a1) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL2(m, a1); \ } STMT_END @@ -457,7 +460,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL3(m,a1,a2) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL3(m, a1, a2); \ } STMT_END @@ -470,23 +473,22 @@ static const scan_data_t zero_scan_data = (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END -#define vWARN(loc,m) STMT_START { \ +#define ckWARNreg(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END -#define vWARNdep(loc,m) STMT_START { \ +#define ckWARNregdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ - "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + m REPORT_LOCATION, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END - -#define vWARN2(loc, m, a1) STMT_START { \ +#define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -496,12 +498,24 @@ static const scan_data_t zero_scan_data = a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END +#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END +#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ @@ -630,6 +644,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min const STRLEN old_l = CHR_SVLEN(*data->longest); GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_SCAN_COMMIT; + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { SvSetMagicSV(*data->longest, data->last_found); if (*data->longest == data->longest_fixed) { @@ -676,6 +692,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min STATIC void S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_ANYTHING; + ANYOF_CLASS_ZERO(cl); ANYOF_BITMAP_SETALL(cl); cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; @@ -689,6 +707,8 @@ S_cl_is_anything(const struct regnode_charclass_class *cl) { int value; + PERL_ARGS_ASSERT_CL_IS_ANYTHING; + for (value = 0; value <= ANYOF_MAX; value += 2) if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) return 1; @@ -703,6 +723,8 @@ S_cl_is_anything(const struct regnode_charclass_class *cl) STATIC void S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_INIT; + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(pRExC_state, cl); @@ -711,6 +733,8 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) STATIC void S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_INIT_ZERO; + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(pRExC_state, cl); @@ -724,6 +748,7 @@ STATIC void S_cl_and(struct regnode_charclass_class *cl, const struct regnode_charclass_class *and_with) { + PERL_ARGS_ASSERT_CL_AND; assert(and_with->type == ANYOF); if (!(and_with->flags & ANYOF_CLASS) @@ -762,6 +787,8 @@ S_cl_and(struct regnode_charclass_class *cl, STATIC void S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) { + PERL_ARGS_ASSERT_CL_OR; + if (or_with->flags & ANYOF_INVERT) { /* We do not use * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) @@ -853,6 +880,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_DUMP_TRIE; PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", (int)depth * 2 + 2,"", @@ -935,6 +963,9 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, SV *sv=sv_newmortal(); int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; + /* print out the table precompression. */ PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", @@ -990,6 +1021,8 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, SV *sv=sv_newmortal(); int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; /* print out the table precompression so that we can do a visual check @@ -1230,10 +1263,9 @@ is the recommended Unicode-aware way of saying /* store the word for dumping */ \ SV* tmp; \ if (OP(noper) != NOTHING) \ - tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \ + tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ else \ - tmp = newSVpvn( "", 0 ); \ - if ( UTF ) SvUTF8_on( tmp ); \ + tmp = newSVpvn_utf8( "", 0, UTF ); \ av_push( trie_words, tmp ); \ }); \ \ @@ -1318,6 +1350,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #endif SV *re_trie_maxbuff; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -1973,7 +2007,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( folder ) TRIE_BITMAP_SET(trie, folder[ *ch ]); DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, (char*)ch) + PerlIO_printf(Perl_debug_log, "%s", (char*)ch) ); } } @@ -2162,6 +2196,8 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode reg_ac_data *aho; const U32 data_slot = add_data( pRExC_state, 1, "T" ); GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -2278,6 +2314,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags #else PERL_UNUSED_ARG(depth); #endif + + PERL_ARGS_ASSERT_JOIN_EXACT; #ifndef EXPERIMENTAL_INPLACESCAN PERL_UNUSED_ARG(flags); PERL_UNUSED_ARG(val); @@ -2488,9 +2526,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, regnode *first_non_open = scan; I32 stopmin = I32_MAX; scan_frame *frame = NULL; - GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_STUDY_CHUNK; + #ifdef DEBUGGING StructCopy(&zero_scan_data, &data_fake, scan_data_t); #endif @@ -2792,13 +2831,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } else { /* - Currently we assume that the trie can handle unicode and ascii - matches fold cased matches. If this proves true then the following - define will prevent tries in this situation. - - #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) -*/ + Currently we do not believe that the trie logic can + handle case insensitive matching properly when the + pattern is not unicode (thus forcing unicode semantics). + + If/when this is fixed the following define can be swapped + in below to fully enable trie logic. + #define TRIE_TYPE_IS_SAFE 1 + +*/ +#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) + if ( last && TRIE_TYPE_IS_SAFE ) { make_trie( pRExC_state, startbranch, first, cur, tail, count, @@ -3149,11 +3193,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) - && maxcount <= REG_INFTY/3 /* Complement check for big count */ - && ckWARN(WARN_REGEXP)) + && maxcount <= REG_INFTY/3) /* Complement check for big count */ { - vWARN(RExC_parse, - "Quantifier unexpected on zero-length expression"); + ckWARNreg(RExC_parse, + "Quantifier unexpected on zero-length expression"); } min += minnext * mincount; @@ -3318,9 +3361,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l -= old; /* Get the added string: */ - last_str = newSVpvn(s + old, l); - if (UTF) - SvUTF8_on(last_str); + last_str = newSVpvn_utf8(s + old, l, UTF); if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { @@ -3699,11 +3740,22 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (f & SCF_DO_STCLASS_AND) { - const int was = (data->start_class->flags & ANYOF_EOS); - - cl_and(data->start_class, &intrnl); - if (was) - data->start_class->flags |= ANYOF_EOS; + if (flags & SCF_DO_STCLASS_OR) { + /* OR before, AND after: ideally we would recurse with + * data_fake to get the AND applied by study of the + * remainder of the pattern, and then derecurse; + * *** HACK *** for now just treat as "no information". + * See [perl #56690]. + */ + cl_init(pRExC_state, data->start_class); + } else { + /* AND before and after: combine and continue */ + const int was = (data->start_class->flags & ANYOF_EOS); + + cl_and(data->start_class, &intrnl); + if (was) + data->start_class->flags |= ANYOF_EOS; + } } } #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY @@ -4050,6 +4102,8 @@ S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; + PERL_ARGS_ASSERT_ADD_DATA; + Renewc(RExC_rxi->data, sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), char, struct reg_data); @@ -4129,10 +4183,13 @@ extern const struct regexp_engine my_reg_engine; #ifndef PERL_IN_XSUB_RE REGEXP * -Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags) +Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) { dVAR; HV * const table = GvHV(PL_hintgv); + + PERL_ARGS_ASSERT_PREGCOMP; + /* Dispatch a request to compile a regexp to correct regexp engine. */ if (table) { @@ -4152,13 +4209,14 @@ Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags) #endif REGEXP * -Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) +Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) { dVAR; - register REGEXP *r; + REGEXP *rx; + struct regexp *r; register regexp_internal *ri; STRLEN plen; - char* exp = SvPV((SV*)pattern, plen); + char *exp = SvPV(pattern, plen); char* xend = exp + plen; regnode *scan; I32 flags; @@ -4173,9 +4231,12 @@ Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) RExC_state_t copyRExC_state; #endif GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_COMPILE; + DEBUG_r(if (!PL_colorset) reginitcolors()); - RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8; + RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); @@ -4205,7 +4266,6 @@ redo_first_pass: RExC_size = 0L; RExC_emit = &PL_regdummy; RExC_whilem_seen = 0; - RExC_charnames = NULL; RExC_open_parens = NULL; RExC_close_parens = NULL; RExC_opend = NULL; @@ -4264,7 +4324,8 @@ redo_first_pass: /* Allocate space and zero-initialize. Note, the two step process of zeroing when in debug mode, thus anything assigned has to happen after that */ - Newxz(r, 1, regexp); + rx = (REGEXP*) newSV_type(SVt_REGEXP); + r = (struct regexp*)SvANY(rx); Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char, regexp_internal); if ( r == NULL || ri == NULL ) @@ -4280,7 +4341,6 @@ redo_first_pass: /* non-zero initialization begins here */ RXi_SET( r, ri ); r->engine= RE_ENGINE_PTR; - r->refcnt = 1; r->extflags = pm_flags; { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); @@ -4290,12 +4350,14 @@ redo_first_pass: >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ char *p; - RXp_WRAPLEN(r) = plen + has_minus + has_p + has_runon + const STRLEN wraplen = plen + has_minus + has_p + has_runon + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); - Newx(RXp_WRAPPED(r), RXp_WRAPLEN(r) + 1, char ); - p = RXp_WRAPPED(r); + p = sv_grow(MUTABLE_SV(rx), wraplen + 1); + SvCUR_set(rx, wraplen); + SvPOK_on(rx); + SvFLAGS(rx) |= SvUTF8(pattern); *p++='('; *p++='?'; if (has_p) *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ @@ -4319,8 +4381,8 @@ redo_first_pass: *p++ = ':'; Copy(RExC_precomp, p, plen, char); - assert ((RXp_WRAPPED(r) - p) < 16); - r->pre_prefix = p - RXp_WRAPPED(r); + assert ((RX_WRAPPED(rx) - p) < 16); + r->pre_prefix = p - RX_WRAPPED(rx); p += plen; if (has_runon) *p++ = '\n'; @@ -4347,6 +4409,7 @@ redo_first_pass: (UV)((2*RExC_size+1) * sizeof(U32)))); #endif SetProgLen(ri,RExC_size); + RExC_rx_sv = rx; RExC_rx = r; RExC_rxi = ri; @@ -4364,7 +4427,7 @@ redo_first_pass: RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) { - ReREFCNT_dec(r); + ReREFCNT_dec(rx); return(NULL); } /* XXXX To minimize changes to RE engine we always allocate @@ -4380,7 +4443,10 @@ reStudy: Zero(r->substrs, 1, struct reg_substr_data); #ifdef TRIE_STUDY_OPT - if ( restudied ) { + if (!restudied) { + StructCopy(&zero_scan_data, &data, scan_data_t); + copyRExC_state = RExC_state; + } else { U32 seen=RExC_seen; DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); @@ -4395,9 +4461,6 @@ reStudy: SvREFCNT_dec(data.last_found); } StructCopy(&zero_scan_data, &data, scan_data_t); - } else { - StructCopy(&zero_scan_data, &data, scan_data_t); - copyRExC_state = RExC_state; } #else StructCopy(&zero_scan_data, &data, scan_data_t); @@ -4408,7 +4471,7 @@ reStudy: /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ if (UTF) - r->extflags |= RXf_UTF8; /* Unicode in it? */ + SvUTF8_on(rx); /* Unicode in it? */ ri->regstclass = NULL; if (RExC_naughty >= 10) /* Probably an expensive pattern. */ r->intflags |= PREGf_NAUGHTY; @@ -4425,7 +4488,17 @@ reStudy: regnode *first= scan; regnode *first_next= regnext(first); - /* Skip introductions and multiplicators >= 1. */ + /* + * Skip introductions and multiplicators >= 1 + * so that we can extract the 'meat' of the pattern that must + * match in the large if() sequence following. + * NOTE that EXACT is NOT covered here, as it is normally + * picked up by the optimiser separately. + * + * This is unfortunate as the optimiser isnt handling lookahead + * properly currently. + * + */ while ((OP(first) == OPEN && (sawopen = 1)) || /* An OR of *one* alternative - should not happen now. */ (OP(first) == BRANCH && OP(first_next) != BRANCH) || @@ -4437,16 +4510,17 @@ reStudy: (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) { - + /* + * the only op that could be a regnode is PLUS, all the rest + * will be regnode_1 or regnode_2. + * + */ if (OP(first) == PLUS) sawplus = 1; else first += regarglen[OP(first)]; - if (OP(first) == IFMATCH) { - first = NEXTOPER(first); - first += EXTRA_STEP_2ARGS; - } else /* XXX possible optimisation for /(?=)/ */ - first = NEXTOPER(first); + + first = NEXTOPER(first); first_next= regnext(first); } @@ -4790,22 +4864,22 @@ reStudy: if (RExC_seen & REG_SEEN_CUTGROUP) r->intflags |= PREGf_CUTGROUP_SEEN; if (RExC_paren_names) - r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); + RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); else - r->paren_names = NULL; + RXp_PAREN_NAMES(r) = NULL; #ifdef STUPID_PATTERN_CHECKS - if (RX_PRELEN(r) == 0) + if (RX_PRELEN(rx) == 0) r->extflags |= RXf_NULL; - if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == ' ') + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') /* XXX: this should happen BEFORE we compile */ r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); - else if (RX_PRELEN(r) == 3 && memEQ("\\s+", RXp_PRECOMP(r), 3)) + else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) r->extflags |= RXf_WHITE; - else if (RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == '^') + else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^') r->extflags |= RXf_START_ONLY; #else - if (r->extflags & RXf_SPLIT && RXp_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == ' ') + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') /* XXX: this should happen BEFORE we compile */ r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); else { @@ -4856,7 +4930,7 @@ reStudy: PerlIO_printf(Perl_debug_log, "\n"); }); #endif - return(r); + return rx; } #undef RE_ENGINE_PTR @@ -4866,12 +4940,14 @@ SV* Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, const U32 flags) { + PERL_ARGS_ASSERT_REG_NAMED_BUFF; + PERL_UNUSED_ARG(value); if (flags & RXapif_FETCH) { return reg_named_buff_fetch(rx, key, flags); } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { - Perl_croak(aTHX_ PL_no_modify); + Perl_croak(aTHX_ "%s", PL_no_modify); return NULL; } else if (flags & RXapif_EXISTS) { return reg_named_buff_exists(rx, key, flags) @@ -4891,6 +4967,7 @@ SV* Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, const U32 flags) { + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; PERL_UNUSED_ARG(lastkey); if (flags & RXapif_FIRSTKEY) @@ -4904,15 +4981,20 @@ Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, } SV* -Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags) +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, + const U32 flags) { AV *retarray = NULL; SV *ret; + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; + if (flags & RXapif_ALL) retarray=newAV(); - if (rx && rx->paren_names) { - HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); + if (rx && RXp_PAREN_NAMES(rx)) { + HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); if (he_str) { IV i; SV* sv_dat=HeVAL(he_str); @@ -4923,33 +5005,35 @@ Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 && rx->offs[nums[i]].end != -1) { ret = newSVpvs(""); - CALLREG_NUMBUF_FETCH(rx,nums[i],ret); + CALLREG_NUMBUF_FETCH(r,nums[i],ret); if (!retarray) return ret; } else { ret = newSVsv(&PL_sv_undef); } - if (retarray) { - SvREFCNT_inc_simple_void(ret); + if (retarray) av_push(retarray, ret); - } } if (retarray) - return newRV((SV*)retarray); + return newRV_noinc(MUTABLE_SV(retarray)); } } return NULL; } bool -Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key, +Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, const U32 flags) { - if (rx && rx->paren_names) { + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; + + if (rx && RXp_PAREN_NAMES(rx)) { if (flags & RXapif_ALL) { - return hv_exists_ent(rx->paren_names, key, 0); + return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); } else { - SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags); + SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); if (sv) { SvREFCNT_dec(sv); return TRUE; @@ -4963,22 +5047,31 @@ Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key, } SV* -Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags) +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) { - if ( rx && rx->paren_names ) { - (void)hv_iterinit(rx->paren_names); + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; + + if ( rx && RXp_PAREN_NAMES(rx) ) { + (void)hv_iterinit(RXp_PAREN_NAMES(rx)); - return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY); + return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); } else { return FALSE; } } SV* -Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) { - if (rx && rx->paren_names) { - HV *hv = rx->paren_names; + struct regexp *const rx = (struct regexp *)SvANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv = RXp_PAREN_NAMES(rx); HE *temphe; while ( (temphe = hv_iternext_flags(hv,0)) ) { IV i; @@ -4986,7 +5079,7 @@ Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) SV* sv_dat = HeVAL(temphe); I32 *nums = (I32*)SvPVX(sv_dat); for ( i = 0; i < SvIVX(sv_dat); i++ ) { - if ((I32)(rx->lastcloseparen) >= nums[i] && + if ((I32)(rx->lastparen) >= nums[i] && rx->offs[nums[i]].start != -1 && rx->offs[nums[i]].end != -1) { @@ -4995,9 +5088,7 @@ Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) } } if (parno || flags & RXapif_ALL) { - STRLEN len; - char *pv = HePV(temphe, len); - return newSVpvn(pv,len); + return newSVhek(HeKEY_hek(temphe)); } } } @@ -5005,19 +5096,23 @@ Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) } SV* -Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags) +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) { SV *ret; AV *av; I32 length; + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; - if (rx && rx->paren_names) { + if (rx && RXp_PAREN_NAMES(rx)) { if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { - return newSViv(HvTOTALKEYS(rx->paren_names)); + return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); } else if (flags & RXapif_ONE) { - ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES)); - av = (AV*)SvRV(ret); + ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); + av = MUTABLE_AV(SvRV(ret)); length = av_len(av); + SvREFCNT_dec(ret); return newSViv(length + 1); } else { Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); @@ -5028,12 +5123,15 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags) } SV* -Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) +Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) { + struct regexp *const rx = (struct regexp *)SvANY(r); AV *av = newAV(); - if (rx && rx->paren_names) { - HV *hv= rx->paren_names; + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv= RXp_PAREN_NAMES(rx); HE *temphe; (void)hv_iterinit(hv); while ( (temphe = hv_iternext_flags(hv,0)) ) { @@ -5042,7 +5140,7 @@ Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) SV* sv_dat = HeVAL(temphe); I32 *nums = (I32*)SvPVX(sv_dat); for ( i = 0; i < SvIVX(sv_dat); i++ ) { - if ((I32)(rx->lastcloseparen) >= nums[i] && + if ((I32)(rx->lastparen) >= nums[i] && rx->offs[nums[i]].start != -1 && rx->offs[nums[i]].end != -1) { @@ -5051,22 +5149,24 @@ Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) } } if (parno || flags & RXapif_ALL) { - STRLEN len; - char *pv = HePV(temphe, len); - av_push(av, newSVpvn(pv,len)); + av_push(av, newSVhek(HeKEY_hek(temphe))); } } } - return newRV((SV*)av); + return newRV_noinc(MUTABLE_SV(av)); } void -Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv) +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, + SV * const sv) { + struct regexp *const rx = (struct regexp *)SvANY(r); char *s = NULL; I32 i = 0; I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; if (!rx->subbeg) { sv_setsv(sv,&PL_sv_undef); @@ -5140,21 +5240,26 @@ void Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, SV const * const value) { + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; + PERL_UNUSED_ARG(rx); PERL_UNUSED_ARG(paren); PERL_UNUSED_ARG(value); if (!PL_localizing) - Perl_croak(aTHX_ PL_no_modify); + Perl_croak(aTHX_ "%s", PL_no_modify); } I32 -Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, const I32 paren) { + struct regexp *const rx = (struct regexp *)SvANY(r); I32 i; I32 s1, t1; + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + /* Some of this code was originally in C in F */ switch (paren) { /* $` / ${^PREMATCH} */ @@ -5189,7 +5294,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, goto getlen; } else { if (ckWARN(WARN_UNINITIALIZED)) - report_uninit((SV*)sv); + report_uninit((const SV *)sv); return 0; } } @@ -5209,8 +5314,12 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, SV* Perl_reg_qr_package(pTHX_ REGEXP * const rx) { + PERL_ARGS_ASSERT_REG_QR_PACKAGE; PERL_UNUSED_ARG(rx); - return NULL; + if (0) + return NULL; + else + return newSVpvs("Regexp"); } /* Scans the name of a named buffer from the pattern. @@ -5226,9 +5335,12 @@ Perl_reg_qr_package(pTHX_ REGEXP * const rx) #define REG_RSN_RETURN_DATA 2 STATIC SV* -S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { +S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) +{ char *name_start = RExC_parse; + PERL_ARGS_ASSERT_REG_SCAN_NAME; + if (isIDFIRST_lazy_if(RExC_parse, UTF)) { /* skip IDFIRST by using do...while */ if (UTF) @@ -5242,10 +5354,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { } if ( flags ) { - SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start, - (int)(RExC_parse - name_start))); - if (UTF) - SvUTF8_on(sv_name); + SV* sv_name + = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), + SVs_TEMP | (UTF ? SVf_UTF8 : 0)); if ( flags == REG_RSN_RETURN_NAME) return sv_name; else if (flags==REG_RSN_RETURN_DATA) { @@ -5360,6 +5471,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char * const oregcomp_parse = RExC_parse; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG; DEBUG_PARSE("reg "); *flagp = 0; /* Tentatively. */ @@ -5549,10 +5662,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) "panic: reg_scan_name returned NULL"); if (!RExC_paren_names) { RExC_paren_names= newHV(); - sv_2mortal((SV*)RExC_paren_names); + sv_2mortal(MUTABLE_SV(RExC_paren_names)); #ifdef DEBUGGING RExC_paren_name_list= newAV(); - sv_2mortal((SV*)RExC_paren_name_list); + sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); #endif } he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); @@ -5581,13 +5694,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); pv[count] = RExC_npar; - SvIVX(sv_dat)++; + SvIV_set(sv_dat, SvIVX(sv_dat) + 1); } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); SvIOK_on(sv_dat); - SvIVX(sv_dat)= 1; + SvIV_set(sv_dat, 1); } #ifdef DEBUGGING if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) @@ -5603,6 +5716,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_seen |= REG_SEEN_LOOKBEHIND; RExC_parse++; case '=': /* (?=...) */ + RExC_seen_zerolen++; + break; case '!': /* (?!...) */ RExC_seen_zerolen++; if (*RExC_parse == ')') { @@ -5973,8 +6088,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case KEEPCOPY_PAT_MOD: /* 'p' */ if (flagsp == &negflags) { - if (SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(RExC_parse + 1,"Useless use of (?-p)"); + if (SIZE_ONLY) + ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); } else { *flagsp |= RXf_PMf_KEEPCOPY; } @@ -6046,6 +6161,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* Pick up the branches, linking them together. */ parse_start = RExC_parse; /* MJD */ br = regbranch(pRExC_state, &flags, 1,depth+1); + + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } + /* branch_len = (paren != 0); */ if (br == NULL) @@ -6204,6 +6326,9 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) register regnode *latest; I32 flags = 0, c = 0; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGBRANCH; + DEBUG_PARSE("brnc"); if (first) @@ -6279,6 +6404,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start; const char *maxpos = NULL; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPIECE; + DEBUG_PARSE("piec"); ret = regatom(pRExC_state, &flags,depth+1); @@ -6356,7 +6484,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = WORST; if (max > 0) *flagp |= HASWIDTH; - if (max && max < min) + if (max < min) vFAIL("Can't do {n,m} with n > m"); if (!SIZE_ONLY) { ARG1_SET(ret, (U16)min); @@ -6416,11 +6544,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto do_curly; } nest_check: - if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { - vWARN3(RExC_parse, - "%.*s matches null string many times", - (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), - origparse); + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { + ckWARN3reg(RExC_parse, + "%.*s matches null string many times", + (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), + origparse); } if (RExC_parse < RExC_end && *RExC_parse == '?') { @@ -6455,271 +6583,278 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* reg_namedseq(pRExC_state,UVp) This is expected to be called by a parser routine that has - recognized'\N' and needs to handle the rest. RExC_parse is + recognized '\N' and needs to handle the rest. RExC_parse is expected to point at the first char following the N at the time of the call. + + The \N may be inside (indicated by valuep not being NULL) or outside a + character class. + + \N may begin either a named sequence, or if outside a character class, mean + to match a non-newline. For non single-quoted regexes, the tokenizer has + attempted to decide which, and in the case of a named sequence converted it + into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, + where c1... are the characters in the sequence. For single-quoted regexes, + the tokenizer passes the \N sequence through unchanged; this code will not + attempt to determine this nor expand those. The net effect is that if the + beginning of the passed-in pattern isn't '{U+' or there is no '}', it + signals that this \N occurrence means to match a non-newline. + + Only the \N{U+...} form should occur in a character class, for the same + reason that '.' inside a character class means to just match a period: it + just doesn't make sense. If valuep is non-null then it is assumed that we are parsing inside of a charclass definition and the first codepoint in the resolved string is returned via *valuep and the routine will return NULL. In this mode if a multichar string is returned from the charnames - handler a warning will be issued, and only the first char in the + handler, a warning will be issued, and only the first char in the sequence will be examined. If the string returned is zero length then the value of *valuep is undefined and NON-NULL will be returned to indicate failure. (This will NOT be a valid pointer to a regnode.) - If value is null then it is assumed that we are parsing normal text - and inserts a new EXACT node into the program containing the resolved - string and returns a pointer to the new node. If the string is - zerolength a NOTHING node is emitted. - + If valuep is null then it is assumed that we are parsing normal text and a + new EXACT node is inserted into the program containing the resolved string, + and a pointer to the new node is returned. But if the string is zero length + a NOTHING node is emitted instead. + On success RExC_parse is set to the char following the endbrace. - Parsing failures will generate a fatal errorvia vFAIL(...) - - NOTE: We cache all results from the charnames handler locally in - the RExC_charnames hash (created on first use) to prevent a charnames - handler from playing silly-buggers and returning a short string and - then a long string for a given pattern. Since the regexp program - size is calculated during an initial parse this would result - in a buffer overrun so we cache to prevent the charname result from - changing during the course of the parse. - + Parsing failures will generate a fatal error via vFAIL(...) */ STATIC regnode * -S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) +S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) { - char * name; /* start of the content of the name */ - char * endbrace; /* endbrace following the name */ - SV *sv_str = NULL; - SV *sv_name = NULL; - STRLEN len; /* this has various purposes throughout the code */ - bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */ + char * endbrace; /* '}' following the name */ regnode *ret = NULL; - +#ifdef DEBUGGING + char* parse_start = RExC_parse - 2; /* points to the '\N' */ +#endif + char* p; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMEDSEQ; + + GET_RE_DEBUG_FLAGS; + + /* The [^\n] meaning of \N ignores spaces and comments under the /x + * modifier. The other meaning does not */ + p = (RExC_flags & RXf_PMf_EXTENDED) + ? regwhite( pRExC_state, RExC_parse ) + : RExC_parse; + + /* Disambiguate between \N meaning a named character versus \N meaning + * [^\n]. The former is assumed when it can't be the latter. */ + if (*p != '{' || regcurly(p)) { + RExC_parse = p; + if (valuep) { + /* no bare \N in a charclass */ + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + nextchar(pRExC_state); + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + RExC_parse--; + Set_Node_Length(ret, 1); /* MJD */ + return ret; + } + + /* Here, we have decided it should be a named sequence */ + + /* The test above made sure that the next real character is a '{', but + * under the /x modifier, it could be separated by space (or a comment and + * \n) and this is not allowed (for consistency with \x{...} and the + * tokenizer handling of \N{NAME}). */ if (*RExC_parse != '{') { - vFAIL("Missing braces on \\N{}"); + vFAIL("Missing braces on \\N{}"); } - name = RExC_parse+1; - endbrace = strchr(RExC_parse, '}'); - if ( ! endbrace ) { - RExC_parse++; - vFAIL("Missing right brace on \\N{}"); - } - RExC_parse = endbrace + 1; - - - /* RExC_parse points at the beginning brace, - endbrace points at the last */ - if ( name[0]=='U' && name[1]=='+' ) { - /* its a "Unicode hex" notation {U+89AB} */ - I32 fl = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX - | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); - UV cp; - char string; - len = (STRLEN)(endbrace - name - 2); - cp = grok_hex(name + 2, &len, &fl, NULL); - if ( len != (STRLEN)(endbrace - name - 2) ) { - cp = 0xFFFD; - } - if (cp > 0xff) - RExC_utf8 = 1; - if ( valuep ) { - *valuep = cp; - return NULL; - } - string = (char)cp; - sv_str= newSVpvn(&string, 1); - } else { - /* fetch the charnames handler for this scope */ - HV * const table = GvHV(PL_hintgv); - SV **cvp= table ? - hv_fetchs(table, "charnames", FALSE) : - NULL; - SV *cv= cvp ? *cvp : NULL; - HE *he_str; - int count; - /* create an SV with the name as argument */ - sv_name = newSVpvn(name, endbrace - name); - - if (!table || !(PL_hints & HINT_LOCALIZE_HH)) { - vFAIL2("Constant(\\N{%s}) unknown: " - "(possibly a missing \"use charnames ...\")", - SvPVX(sv_name)); - } - if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */ - vFAIL2("Constant(\\N{%s}): " - "$^H{charnames} is not defined",SvPVX(sv_name)); - } - - - - if (!RExC_charnames) { - /* make sure our cache is allocated */ - RExC_charnames = newHV(); - sv_2mortal((SV*)RExC_charnames); - } - /* see if we have looked this one up before */ - he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 ); - if ( he_str ) { - sv_str = HeVAL(he_str); - cached = 1; - } else { - dSP ; - ENTER ; - SAVETMPS ; - PUSHMARK(SP) ; - - XPUSHs(sv_name); - - PUTBACK ; - - count= call_sv(cv, G_SCALAR); - - if (count == 1) { /* XXXX is this right? dmq */ - sv_str = POPs; - SvREFCNT_inc_simple_void(sv_str); - } - - SPAGAIN ; - PUTBACK ; - FREETMPS ; - LEAVE ; - - if ( !sv_str || !SvOK(sv_str) ) { - vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} " - "did not return a defined value",SvPVX(sv_name)); - } - if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0)) - cached = 1; - } + RExC_parse++; /* Skip past the '{' */ + + if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ + || ! (endbrace == RExC_parse /* nothing between the {} */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */ + { + if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ + vFAIL("\\N{NAME} must be resolved by the lexer"); } - if (valuep) { - char *p = SvPV(sv_str, len); - if (len) { - STRLEN numlen = 1; - if ( SvUTF8(sv_str) ) { - *valuep = utf8_to_uvchr((U8*)p, &numlen); - if (*valuep > 0x7F) - RExC_utf8 = 1; - /* XXXX - We have to turn on utf8 for high bit chars otherwise - we get failures with - - "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i - "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i - - This is different from what \x{} would do with the same - codepoint, where the condition is > 0xFF. - - dmq - */ - - - } else { - *valuep = (UV)*p; - /* warn if we havent used the whole string? */ - } - if (numlen 0) { - const STRLEN unilen = reguni(pRExC_state, uvc, s); - s += unilen; - len += unilen; - /* In EBCDIC the numlen - * and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; /* "Can't happen." */ - } - } else { - const STRLEN unilen = reguni(pRExC_state, uvc, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } - } - } else { - len++; - REGC(*p, s++); - } - } - if (SIZE_ONLY) { - RExC_size += STR_SZ(len); - } else { - STR_LEN(ret) = len; - RExC_emit += STR_SZ(len); - } - Set_Node_Cur_Length(ret); /* MJD */ - RExC_parse--; - nextchar(pRExC_state); - } else { - ret = reg_node(pRExC_state,NOTHING); + + if (endbrace == RExC_parse) { /* empty: \N{} */ + if (! valuep) { + RExC_parse = endbrace + 1; + return reg_node(pRExC_state,NOTHING); + } + + if (SIZE_ONLY) { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class" + ); + RExC_parse = endbrace + 1; + } + *valuep = 0; + return (regnode *) &RExC_parse; /* Invalid regnode pointer */ } - if (!cached) { - SvREFCNT_dec(sv_str); + + RExC_utf8 = 1; /* named sequences imply Unicode semantics */ + RExC_parse += 2; /* Skip past the 'U+' */ + + if (valuep) { /* In a bracketed char class */ + /* We only pay attention to the first char of + multichar strings being returned. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. XXX Solution is to recharacterize as + [rest-of-class]|multi1|multi2... */ + + STRLEN length_of_hex; + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); + + char * endchar = strchr(RExC_parse, '.'); + if (endchar) { + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } + else endchar = endbrace; + + length_of_hex = (STRLEN)(endchar - RExC_parse); + *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL); + + /* The tokenizer should have guaranteed validity, but it's possible to + * bypass it by using single quoting, so check */ + if (length_of_hex == 0 + || length_of_hex != (STRLEN)(endchar - RExC_parse) ) + { + RExC_parse += length_of_hex; /* Includes all the valid */ + RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ + ? UTF8SKIP(RExC_parse) + : 1; + /* Guard against malformed utf8 */ + if (RExC_parse >= endchar) RExC_parse = endchar; + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + + RExC_parse = endbrace + 1; + if (endchar == endbrace) return NULL; + + ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */ } - if (sv_name) { - SvREFCNT_dec(sv_name); + else { /* Not a char class */ + char *s; /* String to put in generated EXACT node */ + STRLEN len = 0; /* Its current length */ + char *endchar; /* Points to '.' or '}' ending cur char in the input + stream */ + + ret = reg_node(pRExC_state, + (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT)); + s= STRING(ret); + + /* Exact nodes can hold only a U8 length's of text = 255. Loop through + * the input which is of the form now 'c1.c2.c3...}' until find the + * ending brace or exeed length 255. The characters that exceed this + * limit are dropped. The limit could be relaxed should it become + * desirable by reparsing this as (?:\N{NAME}), so could generate + * multiple EXACT nodes, as is done for just regular input. But this + * is primarily a named character, and not intended to be a huge long + * string, so 255 bytes should be good enough */ + while (1) { + STRLEN length_of_hex; + I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); + UV cp; /* Ord of current character */ + + /* Code points are separated by dots. If none, there is only one + * code point, and is terminated by the brace */ + endchar = strchr(RExC_parse, '.'); + if (! endchar) endchar = endbrace; + + /* The values are Unicode even on EBCDIC machines */ + length_of_hex = (STRLEN)(endchar - RExC_parse); + cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL); + if ( length_of_hex == 0 + || length_of_hex != (STRLEN)(endchar - RExC_parse) ) + { + RExC_parse += length_of_hex; /* Includes all the valid */ + RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ + ? UTF8SKIP(RExC_parse) + : 1; + /* Guard against malformed utf8 */ + if (RExC_parse >= endchar) RExC_parse = endchar; + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + + if (! FOLD) { /* Not folding, just append to the string */ + STRLEN unilen; + + /* Quit before adding this character if would exceed limit */ + if (len + UNISKIP(cp) > U8_MAX) break; + + unilen = reguni(pRExC_state, cp, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + } else { /* Folding, output the folded equivalent */ + STRLEN foldlen,numlen; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; + cp = toFOLD_uni(cp, tmpbuf, &foldlen); + + /* Quit before exceeding size limit */ + if (len + foldlen > U8_MAX) break; + + for (foldbuf = tmpbuf; + foldlen; + foldlen -= numlen) + { + cp = utf8_to_uvchr(foldbuf, &numlen); + if (numlen > 0) { + const STRLEN unilen = reguni(pRExC_state, cp, s); + s += unilen; + len += unilen; + /* In EBCDIC the numlen and unilen can differ. */ + foldbuf += numlen; + if (numlen >= foldlen) + break; + } + else + break; /* "Can't happen." */ + } + } + + /* Point to the beginning of the next character in the sequence. */ + RExC_parse = endchar + 1; + + /* Quit if no more characters */ + if (RExC_parse >= endbrace) break; + } + + + if (SIZE_ONLY) { + if (RExC_parse < endbrace) { + ckWARNreg(RExC_parse - 1, + "Using just the first characters returned by \\N{}"); + } + + RExC_size += STR_SZ(len); + } else { + STR_LEN(ret) = len; + RExC_emit += STR_SZ(len); + } + + RExC_parse = endbrace + 1; + + *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail + with malformed in t/re/pat_advanced.t */ + RExC_parse --; + Set_Node_Cur_Length(ret); /* MJD */ + nextchar(pRExC_state); } - return ret; + return ret; } @@ -6737,11 +6872,13 @@ STATIC UV S_reg_recode(pTHX_ const char value, SV **encp) { STRLEN numlen = 1; - SV * const sv = sv_2mortal(newSVpvn(&value, numlen)); + SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); const STRLEN newlen = SvCUR(sv); UV uv = UNICODE_REPLACEMENT; + PERL_ARGS_ASSERT_REG_RECODE; + if (newlen) uv = SvUTF8(sv) ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) @@ -6788,6 +6925,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) DEBUG_PARSE("atom"); *flagp = WORST; /* Tentatively. */ + PERL_ARGS_ASSERT_REGATOM; tryagain: switch ((U8)*RExC_parse) { @@ -7046,12 +7184,12 @@ tryagain: } break; case 'N': - /* Handle \N{NAME} here and not below because it can be + /* Handle \N and \N{NAME} here and not below because it can be multicharacter. join_exact() will join them up later on. Also this makes sure that things like /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq*/ ++RExC_parse; - ret= reg_namedseq(pRExC_state, NULL); + ret= reg_namedseq(pRExC_state, NULL, flagp); break; case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: @@ -7317,6 +7455,18 @@ tryagain: I32 flags = 0; STRLEN numlen = 3; ender = grok_oct(p, &numlen, &flags, NULL); + + /* An octal above 0xff is interpreted differently + * depending on if the re is in utf8 or not. If it + * is in utf8, the value will be itself, otherwise + * it is interpreted as modulo 0x100. It has been + * decided to discourage the use of octal above the + * single-byte range. For now, warn only when + * it ends up modulo */ + if (SIZE_ONLY && ender >= 0x100 + && ! UTF && ! PL_encoding) { + ckWARNregdep(p, "Use of octal value above 377 is deprecated"); + } p += numlen; } else { @@ -7330,8 +7480,8 @@ tryagain: { SV* enc = PL_encoding; ender = reg_recode((const char)(U8)ender, &enc); - if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(p, "Invalid escape in the specified encoding"); + if (!enc && SIZE_ONLY) + ckWARNreg(p, "Invalid escape in the specified encoding"); RExC_utf8 = 1; } break; @@ -7340,8 +7490,8 @@ tryagain: FAIL("Trailing \\"); /* FALL THROUGH */ default: - if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP)) - vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); + if (!SIZE_ONLY&& isALPHA(*p)) + ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); goto normal_default; } break; @@ -7468,6 +7618,9 @@ STATIC char * S_regwhite( RExC_state_t *pRExC_state, char *p ) { const char *e = RExC_end; + + PERL_ARGS_ASSERT_REGWHITE; + while (p < e) { if (isSPACE(*p)) ++p; @@ -7504,6 +7657,8 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) dVAR; I32 namedclass = OOB_NAMEDCLASS; + PERL_ARGS_ASSERT_REGPPOSIXCC; + if (value == '[' && RExC_parse + 1 < RExC_end && /* I smell either [: or [= or [. -- POSIX has been here, right? */ POSIXCC(UCHARAT(RExC_parse))) { @@ -7618,6 +7773,9 @@ STATIC void S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) { dVAR; + + PERL_ARGS_ASSERT_CHECKPOSIXCC; + if (POSIXCC(UCHARAT(RExC_parse))) { const char *s = RExC_parse; const char c = *s++; @@ -7625,10 +7783,9 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) while (isALNUM(*s)) s++; if (*s && c == *s && s[1] == ']') { - if (ckWARN(WARN_REGEXP)) - vWARN3(s+2, - "POSIX syntax [%c %c] belongs inside character classes", - c, c); + ckWARN3reg(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); /* [[=foo=]] and [[.foo.]] are still future. */ if (POSIXCC_NOTYET(c)) { @@ -7683,6 +7840,22 @@ case ANYOF_N##NAME: \ what = WORD; \ break +/* + We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test + so that it is possible to override the option here without having to + rebuild the entire core. as we are required to do if we change regcomp.h + which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. +*/ +#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS +#define BROKEN_UNICODE_CHARCLASS_MAPPINGS +#endif + +#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS +#define POSIX_CC_UNI_NAME(CCNAME) CCNAME +#else +#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME +#endif + /* parse a class specification and produce either an ANYOF node that matches the pattern or if the pattern matches a single char only and @@ -7716,6 +7889,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCLASS; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -7812,7 +7987,7 @@ parseit: from earlier versions, OTOH that behaviour was broken as well. */ UV v; /* value is register so we cant & it /grrr */ - if (reg_namedseq(pRExC_state, &v)) { + if (reg_namedseq(pRExC_state, &v, NULL)) { goto parseit; } value= v; @@ -7906,16 +8081,16 @@ parseit: { SV* enc = PL_encoding; value = reg_recode((const char)(U8)value, &enc); - if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(RExC_parse, - "Invalid escape in the specified encoding"); + if (!enc && SIZE_ONLY) + ckWARNreg(RExC_parse, + "Invalid escape in the specified encoding"); break; } default: - if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP)) - vWARN2(RExC_parse, - "Unrecognized escape \\%c in character class passed through", - (int)value); + if (!SIZE_ONLY && isALPHA(value)) + ckWARN2reg(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); break; } } /* end of \blah */ @@ -7934,14 +8109,13 @@ parseit: /* a bad range like a-\d, a-[:digit:] ? */ if (range) { if (!SIZE_ONLY) { - if (ckWARN(WARN_REGEXP)) { - const int w = - RExC_parse >= rangebegin ? - RExC_parse - rangebegin : 0; - vWARN4(RExC_parse, + const int w = + RExC_parse >= rangebegin ? + RExC_parse - rangebegin : 0; + ckWARN4reg(RExC_parse, "False [] range \"%*.*s\"", w, w, rangebegin); - } + if (prevvalue < 256) { ANYOF_BITMAP_SET(ret, prevvalue); ANYOF_BITMAP_SET(ret, '-'); @@ -7969,18 +8143,24 @@ parseit: * A similar issue a little earlier when switching on value. * --jhi */ switch ((I32)namedclass) { + + case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum")); + case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha")); + case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank")); + case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl")); + case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph")); + case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower")); + case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print")); + case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space")); + case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct")); + case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper")); +#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS case _C_C_T_(ALNUM, isALNUM(value), "Word"); - case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum"); - case _C_C_T_(ALPHA, isALPHA(value), "Alpha"); - case _C_C_T_(BLANK, isBLANK(value), "Blank"); - case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl"); - case _C_C_T_(GRAPH, isGRAPH(value), "Graph"); - case _C_C_T_(LOWER, isLOWER(value), "Lower"); - case _C_C_T_(PRINT, isPRINT(value), "Print"); - case _C_C_T_(PSXSPC, isPSXSPC(value), "Space"); - case _C_C_T_(PUNCT, isPUNCT(value), "Punct"); case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); - case _C_C_T_(UPPER, isUPPER(value), "Upper"); +#else + case _C_C_T_(SPACE, isSPACE(value), "PerlSpace"); + case _C_C_T_(ALNUM, isALNUM(value), "PerlWord"); +#endif case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); @@ -8027,7 +8207,7 @@ parseit: ANYOF_BITMAP_SET(ret, value); } yesno = '+'; - what = "Digit"; + what = POSIX_CC_UNI_NAME("Digit"); break; case ANYOF_NDIGIT: if (LOC) @@ -8040,7 +8220,7 @@ parseit: ANYOF_BITMAP_SET(ret, value); } yesno = '!'; - what = "Digit"; + what = POSIX_CC_UNI_NAME("Digit"); break; case ANYOF_MAX: /* this is to handle \p and \P */ @@ -8179,8 +8359,8 @@ parseit: if (!unicode_alternate) unicode_alternate = newAV(); - sv = newSVpvn((char*)foldbuf, foldlen); - SvUTF8_on(sv); + sv = newSVpvn_utf8((char*)foldbuf, foldlen, + TRUE); av_push(unicode_alternate, sv); } } @@ -8240,6 +8420,7 @@ parseit: *STRING(ret)= (char)value; STR_LEN(ret)= 1; RExC_emit += STR_SZ(1); + SvREFCNT_dec(listsv); return ret; } /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ @@ -8276,8 +8457,8 @@ parseit: * used later (regexec.c:S_reginclass()). */ av_store(av, 0, listsv); av_store(av, 1, NULL); - av_store(av, 2, (SV*)unicode_alternate); - rv = newRV_noinc((SV*)av); + av_store(av, 2, MUTABLE_SV(unicode_alternate)); + rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, 1, "s"); RExC_rxi->data->data[n] = (void*)rv; ARG_SET(ret, n); @@ -8303,6 +8484,9 @@ STATIC bool S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) { bool ended = 0; + + PERL_ARGS_ASSERT_REG_SKIPCOMMENT; + while (RExC_parse < RExC_end) if (*RExC_parse++ == '\n') { ended = 1; @@ -8335,6 +8519,8 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) { char* const retval = RExC_parse++; + PERL_ARGS_ASSERT_NEXTCHAR; + for (;;) { if (*RExC_parse == '(' && RExC_parse[1] == '?' && RExC_parse[2] == '#') { @@ -8371,6 +8557,8 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REG_NODE; + if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 1; @@ -8410,6 +8598,8 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGANODE; + if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 2; @@ -8460,6 +8650,9 @@ STATIC STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) { dVAR; + + PERL_ARGS_ASSERT_REGUNI; + return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); } @@ -8478,6 +8671,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) const int offset = regarglen[(U8)op]; const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGINSERT; PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); @@ -8560,6 +8755,8 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de dVAR; register regnode *scan; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -8620,9 +8817,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, #ifdef EXPERIMENTAL_INPLACESCAN I32 min = 0; #endif - GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGTAIL_STUDY; + if (SIZE_ONLY) return exact; @@ -8689,9 +8887,12 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, /* - regcurly - a little FSA that accepts {\d+,?\d*} */ -STATIC I32 -S_regcurly(register const char *s) +#ifndef PERL_IN_XSUB_RE +I32 +Perl_regcurly(register const char *s) { + PERL_ARGS_ASSERT_REGCURLY; + if (*s++ != '{') return FALSE; if (!isDIGIT(*s)) @@ -8706,16 +8907,18 @@ S_regcurly(register const char *s) return FALSE; return TRUE; } - +#endif /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ #ifdef DEBUGGING -void -S_regdump_extflags(pTHX_ const char *lead, const U32 flags) { +static void +S_regdump_extflags(pTHX_ const char *lead, const U32 flags) +{ int bit; int set=0; + for (bit=0; bit<32; bit++) { if (flags & (1<program, ri->program + 1, NULL, NULL, sv, 0, 0); /* Header fields of interest. */ @@ -8816,6 +9021,7 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_printf(Perl_debug_log, "\n"); DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); #else + PERL_ARGS_ASSERT_REGDUMP; PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(r); #endif /* DEBUGGING */ @@ -8824,6 +9030,17 @@ Perl_regdump(pTHX_ const regexp *r) /* - regprop - printable representation of opcode */ +#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ +STMT_START { \ + if (do_sep) { \ + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ + if (flags & ANYOF_INVERT) \ + /*make sure the invert info is in each */ \ + sv_catpvs(sv, "^"); \ + do_sep = 0; \ + } \ +} STMT_END + void Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) { @@ -8833,8 +9050,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGPROP; - sv_setpvn(sv, "", 0); + sv_setpvs(sv, ""); if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ /* It would be nice to FAIL() here, but this may be called from @@ -8913,16 +9131,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ - if ( prog->paren_names ) { + if ( RXp_PAREN_NAMES(prog) ) { if ( k != REF || OP(o) < NREF) { - AV *list= (AV *)progi->data->data[progi->name_list_idx]; + AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); SV **name= av_fetch(list, ARG(o), 0 ); if (name) Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); } else { - AV *list= (AV *)progi->data->data[ progi->name_list_idx ]; - SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ]; + AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); + SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); I32 *nums=(I32*)SvPVX(sv_dat); SV **name= av_fetch(list, nums[0], 0 ); I32 n; @@ -8940,7 +9158,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) else if (k == VERB) { if (!o->flags) Perl_sv_catpvf(aTHX_ sv, ":%"SVf, - SVfARG((SV*)progi->data->data[ ARG( o ) ])); + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == FOLDCHAR) @@ -8948,6 +9166,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); + int do_sep = 0; /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ static const char * const anyofs[] = { @@ -8963,8 +9182,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^alpha:]", "[:ascii:]", "[:^ascii:]", - "[:ctrl:]", - "[:^ctrl:]", + "[:cntrl:]", + "[:^cntrl:]", "[:graph:]", "[:^graph:]", "[:lower:]", @@ -8990,6 +9209,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); if (flags & ANYOF_INVERT) sv_catpvs(sv, "^"); + + /* output what the standard cp 0-255 bitmap matches */ for (i = 0; i <= 256; i++) { if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { if (rangestart == -1) @@ -9003,15 +9224,23 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpvs(sv, "-"); put_byte(sv, i - 1); } + do_sep = 1; rangestart = -1; } } - + + EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); + /* output any special charclass tests (used mostly under use locale) */ if (o->flags & ANYOF_CLASS) for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) - if (ANYOF_CLASS_TEST(o,i)) + if (ANYOF_CLASS_TEST(o,i)) { sv_catpv(sv, anyofs[i]); - + do_sep = 1; + } + + EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); + + /* output information about the unicode matching */ if (flags & ANYOF_UNICODE) sv_catpvs(sv, "{unicode}"); else if (flags & ANYOF_UNICODE_ALL) @@ -9024,7 +9253,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (lv) { if (sw) { U8 s[UTF8_MAXBYTES_CASE+1]; - + for (i = 0; i <= 256; i++) { /* just the first 256 */ uvchr_to_utf8(s, i); @@ -9095,10 +9324,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } SV * -Perl_re_intuit_string(pTHX_ REGEXP * const prog) +Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ dVAR; + struct regexp *const prog = (struct regexp *)SvANY(r); GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_STRING; PERL_UNUSED_CONTEXT; DEBUG_COMPILE_r( @@ -9136,38 +9368,36 @@ Perl_re_intuit_string(pTHX_ REGEXP * const prog) void Perl_pregfree(pTHX_ REGEXP *r) { + SvREFCNT_dec(r); +} + +void +Perl_pregfree2(pTHX_ REGEXP *rx) +{ dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); GET_RE_DEBUG_FLAGS_DECL; - if (!r || (--r->refcnt > 0)) - return; + PERL_ARGS_ASSERT_PREGFREE2; + if (r->mother_re) { ReREFCNT_dec(r->mother_re); } else { - CALLREGFREE_PVT(r); /* free the private data */ - if (r->paren_names) - SvREFCNT_dec(r->paren_names); - Safefree(RXp_WRAPPED(r)); + CALLREGFREE_PVT(rx); /* free the private data */ + SvREFCNT_dec(RXp_PAREN_NAMES(r)); } if (r->substrs) { - if (r->anchored_substr) - SvREFCNT_dec(r->anchored_substr); - if (r->anchored_utf8) - SvREFCNT_dec(r->anchored_utf8); - if (r->float_substr) - SvREFCNT_dec(r->float_substr); - if (r->float_utf8) - SvREFCNT_dec(r->float_utf8); + SvREFCNT_dec(r->anchored_substr); + SvREFCNT_dec(r->anchored_utf8); + SvREFCNT_dec(r->float_substr); + SvREFCNT_dec(r->float_utf8); Safefree(r->substrs); } - RX_MATCH_COPY_FREE(r); + RX_MATCH_COPY_FREE(rx); #ifdef PERL_OLD_COPY_ON_WRITE - if (r->saved_copy) - SvREFCNT_dec(r->saved_copy); + SvREFCNT_dec(r->saved_copy); #endif - Safefree(r->swap); Safefree(r->offs); - Safefree(r); } /* reg_temp_copy() @@ -9188,15 +9418,32 @@ Perl_pregfree(pTHX_ REGEXP *r) REGEXP * -Perl_reg_temp_copy (pTHX_ REGEXP *r) { - regexp *ret; +Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) +{ + struct regexp *ret; + struct regexp *const r = (struct regexp *)SvANY(rx); register const I32 npar = r->nparens+1; - (void)ReREFCNT_inc(r); - Newx(ret, 1, regexp); - StructCopy(r, ret, regexp); + + PERL_ARGS_ASSERT_REG_TEMP_COPY; + + if (!ret_x) + ret_x = (REGEXP*) newSV_type(SVt_REGEXP); + ret = (struct regexp *)SvANY(ret_x); + + (void)ReREFCNT_inc(rx); + /* We can take advantage of the existing "copied buffer" mechanism in SVs + by pointing directly at the buffer, but flagging that the allocated + space in the copy is zero. As we've just done a struct copy, it's now + a case of zero-ing that, rather than copying the current length. */ + SvPV_set(ret_x, RX_WRAPPED(rx)); + SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8); + memcpy(&(ret->xpv_cur), &(r->xpv_cur), + sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); + SvLEN_set(ret_x, 0); + SvSTASH_set(ret_x, NULL); + SvMAGIC_set(ret_x, NULL); Newx(ret->offs, npar, regexp_paren_pair); Copy(r->offs, ret->offs, npar, regexp_paren_pair); - ret->refcnt = 1; if (r->substrs) { Newx(ret->substrs, 1, struct reg_substr_data); StructCopy(r->substrs, ret->substrs, struct reg_substr_data); @@ -9209,14 +9456,13 @@ Perl_reg_temp_copy (pTHX_ REGEXP *r) { /* check_substr and check_utf8, if non-NULL, point to either their anchored or float namesakes, and don't hold a second reference. */ } - RX_MATCH_COPIED_off(ret); + RX_MATCH_COPIED_off(ret_x); #ifdef PERL_OLD_COPY_ON_WRITE ret->saved_copy = NULL; #endif - ret->mother_re = r; - ret->swap = NULL; + ret->mother_re = rx; - return ret; + return ret_x; } #endif @@ -9233,19 +9479,22 @@ Perl_reg_temp_copy (pTHX_ REGEXP *r) { */ void -Perl_regfree_internal(pTHX_ REGEXP * const r) +Perl_regfree_internal(pTHX_ REGEXP * const rx) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; - + + PERL_ARGS_ASSERT_REGFREE_INTERNAL; + DEBUG_COMPILE_r({ if (!PL_colorset) reginitcolors(); { SV *dsv= sv_newmortal(); - RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8), - dsv, RXp_PRECOMP(r), RXp_PRELEN(r), 60); + RE_PV_QUOTED_DECL(s, RX_UTF8(rx), + dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", PL_colors[4],PL_colors[5],s); } @@ -9266,13 +9515,13 @@ Perl_regfree_internal(pTHX_ REGEXP * const r) case 's': case 'S': case 'u': - SvREFCNT_dec((SV*)ri->data->data[n]); + SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); break; case 'f': Safefree(ri->data->data[n]); break; case 'p': - new_comppad = (AV*)ri->data->data[n]; + new_comppad = MUTABLE_AV(ri->data->data[n]); break; case 'o': if (new_comppad == NULL) @@ -9288,7 +9537,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const r) op_free((OP_4tree*)ri->data->data[n]); PAD_RESTORE_LOCAL(old_comppad); - SvREFCNT_dec((SV*)new_comppad); + SvREFCNT_dec(MUTABLE_SV(new_comppad)); new_comppad = NULL; break; case 'n': @@ -9347,15 +9596,15 @@ Perl_regfree_internal(pTHX_ REGEXP * const r) } #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t)) -#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t)) -#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t)) +#define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t))) +#define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t))) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) /* re_dup - duplicate a regexp. - This routine is expected to clone a given regexp structure. It is not - compiler under USE_ITHREADS. + This routine is expected to clone a given regexp structure. It is only + compiled under USE_ITHREADS. After all of the core data stored in struct regexp is duplicated the regexp_engine.dupe method is used to copy any private data @@ -9366,23 +9615,17 @@ Perl_regfree_internal(pTHX_ REGEXP * const r) */ #if defined(USE_ITHREADS) #ifndef PERL_IN_XSUB_RE -regexp * -Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) +void +Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) { dVAR; - regexp *ret; I32 npar; - - if (!r) - return (REGEXP *)NULL; - - if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r))) - return ret; - + const struct regexp *r = (const struct regexp *)SvANY(sstr); + struct regexp *ret = (struct regexp *)SvANY(dstr); + PERL_ARGS_ASSERT_RE_DUP_GUTS; + npar = r->nparens+1; - Newx(ret, 1, regexp); - StructCopy(r, ret, regexp); Newx(ret->offs, npar, regexp_paren_pair); Copy(r->offs, ret->offs, npar, regexp_paren_pair); if(ret->swap) { @@ -9394,7 +9637,9 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) /* Do it this way to avoid reading from *r after the StructCopy(). That way, if any of the sv_dup_inc()s dislodge *r from the L1 cache, it doesn't matter. */ - const bool anchored = r->check_substr == r->anchored_substr; + const bool anchored = r->check_substr + ? r->check_substr == r->anchored_substr + : r->check_utf8 == r->anchored_utf8; Newx(ret->substrs, 1, struct reg_substr_data); StructCopy(r->substrs, ret->substrs, struct reg_substr_data); @@ -9417,16 +9662,21 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) ret->check_substr = ret->float_substr; ret->check_utf8 = ret->float_utf8; } + } else if (ret->check_utf8) { + if (anchored) { + ret->check_utf8 = ret->anchored_utf8; + } else { + ret->check_utf8 = ret->float_utf8; + } } } - RXp_WRAPPED(ret) = SAVEPVN(RXp_WRAPPED(ret), RXp_WRAPLEN(ret)+1); - ret->paren_names = hv_dup_inc(ret->paren_names, param); + RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); if (ret->pprivate) - RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); + RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); - if (RX_MATCH_COPIED(ret)) + if (RX_MATCH_COPIED(dstr)) ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); else ret->subbeg = NULL; @@ -9434,12 +9684,21 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) ret->saved_copy = NULL; #endif - ret->mother_re = NULL; + if (ret->mother_re) { + if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) { + /* Our storage points directly to our mother regexp, but that's + 1: a buffer in a different thread + 2: something we no longer hold a reference on + so we need to copy it locally. */ + /* Note we need to sue SvCUR() on our mother_re, because it, in + turn, may well be pointing to its own mother_re. */ + SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re), + SvCUR(ret->mother_re)+1)); + SvLEN_set(dstr, SvCUR(ret->mother_re)+1); + } + ret->mother_re = NULL; + } ret->gofs = 0; - ret->seen_evals = 0; - - ptr_table_store(PL_ptr_table, r, ret); - return ret; } #endif /* PERL_IN_XSUB_RE */ @@ -9458,17 +9717,20 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) */ void * -Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) +Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); regexp_internal *reti; int len, npar; RXi_GET_DECL(r,ri); + + PERL_ARGS_ASSERT_REGDUPE_INTERNAL; npar = r->nparens+1; len = ProgLen(ri); - Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal); + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); @@ -9493,7 +9755,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) case 'S': case 'p': /* actually an AV, but the dup function is identical. */ case 'u': /* actually an HV, but the dup function is identical. */ - d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param); + d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); break; case 'f': /* This is cheating. */ @@ -9550,48 +9812,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) #endif /* USE_ITHREADS */ -/* - reg_stringify() - - converts a regexp embedded in a MAGIC struct to its stringified form, - caching the converted form in the struct and returns the cached - string. - - If lp is nonnull then it is used to return the length of the - resulting string - - If flags is nonnull and the returned string contains UTF8 then - (*flags & 1) will be true. - - If haseval is nonnull then it is used to return whether the pattern - contains evals. - - Normally called via macro: - - CALLREG_STRINGIFY(mg,&len,&utf8); - - And internally with - - CALLREG_AS_STR(mg,&lp,&flags,&haseval) - - See sv_2pv_flags() in sv.c for an example of internal usage. - - */ #ifndef PERL_IN_XSUB_RE -char * -Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { - dVAR; - const REGEXP * const re = (REGEXP *)mg->mg_obj; - if (haseval) - *haseval = RX_SEEN_EVALS(re); - if (flags) - *flags = ((RX_EXTFLAGS(re) & RXf_UTF8) ? 1 : 0); - if (lp) - *lp = RX_WRAPLEN(re); - return RX_WRAPPED(re); -} - /* - regnext - dig the "next" pointer out of a node */ @@ -9622,6 +9844,8 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) SV *msv; const char *message; + PERL_ARGS_ASSERT_RE_CROAK2; + if (l1 > 510) l1 = 510; if (l1 + l2 > 510) @@ -9711,6 +9935,8 @@ clear_re(pTHX_ void *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { + PERL_ARGS_ASSERT_PUT_BYTE; + /* Our definition of isPRINT() ignores locales, so only bytes that are not part of UTF-8 are considered printable. I assume that the same holds for UTF-EBCDIC. @@ -9752,7 +9978,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; - + + PERL_ARGS_ASSERT_DUMPUNTIL; + #ifdef DEBUG_DUMPUNTIL PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, last ? last-start : 0,plast ? plast-start : 0); @@ -9818,11 +10046,11 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const reg_trie_data * const trie = (reg_trie_data*)ri->data->data[optrie]; #ifdef DEBUGGING - AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET]; + AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); #endif const regnode *nextbranch= NULL; I32 word_idx; - sv_setpvn(sv, "", 0); + sv_setpvs(sv, ""); for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);