X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/e40c711d0c25d143bda13fc8ce188050d8d2ccf6..a4b8f4b4b34c2038e6b9fbd77849f6f99e527be0:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 04e2228..2f72faf 100644 --- a/regcomp.c +++ b/regcomp.c @@ -139,23 +139,31 @@ struct RExC_state_t { corresponding to copy_start */ SSize_t whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ - regnode *emit_bound; /* First regnode outside of the - allocated space */ regnode_offset emit; /* Code-emit pointer */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; SSize_t size; /* Number of regnode equivalents in pattern */ - I32 npar; /* Capture buffer count, (OPEN) plus - one. ("par" 0 is the whole - pattern)*/ - I32 total_par; /* Capture buffer count after parse - completed, (OPEN) plus one. ("par" 0 - is the whole pattern)*/ + + /* position beyond 'precomp' of the warning message furthest away from + * 'precomp'. During the parse, no warnings are raised for any problems + * earlier in the parse than this position. This works if warnings are + * raised the first time a given spot is parsed, and if only one + * independent warning is raised for any given spot */ + Size_t latest_warn_offset; + + I32 npar; /* Capture buffer count so far in the + parse, (OPEN) plus one. ("par" 0 is + the whole pattern)*/ + I32 total_par; /* During initial parse, is either 0, + or -1; the latter indicating a + reparse is needed. After that pass, + it is what 'npar' became after the + pass. Hence, it being > 0 indicates + we are in a reparse situation */ I32 nestroot; /* root parens we are in - used by accept */ - I32 extralen; I32 seen_zerolen; regnode_offset *open_parens; /* offsets to open parens */ regnode_offset *close_parens; /* offsets to close parens */ @@ -215,7 +223,7 @@ struct RExC_state_t { bool strict; bool study_started; bool in_script_run; - bool pass1; + bool use_BRANCHJ; }; #define RExC_flags (pRExC_state->flags) @@ -230,6 +238,7 @@ struct RExC_state_t { #define RExC_start (pRExC_state->start) #define RExC_end (pRExC_state->end) #define RExC_parse (pRExC_state->parse) +#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset ) #define RExC_whilem_seen (pRExC_state->whilem_seen) /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any @@ -247,9 +256,7 @@ struct RExC_state_t { others */ #endif #define RExC_emit (pRExC_state->emit) -#define RExC_pass1 (pRExC_state->pass1) #define RExC_emit_start (pRExC_state->emit_start) -#define RExC_emit_bound (pRExC_state->emit_bound) #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) @@ -257,7 +264,6 @@ struct RExC_state_t { #define RExC_npar (pRExC_state->npar) #define RExC_total_parens (pRExC_state->total_par) #define RExC_nestroot (pRExC_state->nestroot) -#define RExC_extralen (pRExC_state->extralen) #define RExC_seen_zerolen (pRExC_state->seen_zerolen) #define RExC_utf8 (pRExC_state->utf8) #define RExC_uni_semantics (pRExC_state->uni_semantics) @@ -284,7 +290,7 @@ struct RExC_state_t { #define RExC_study_started (pRExC_state->study_started) #define RExC_warn_text (pRExC_state->warn_text) #define RExC_in_script_run (pRExC_state->in_script_run) -#define RExC_use_BRANCHJ (!SIZE_ONLY && RExC_extralen) +#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ) /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set * a flag to disable back-off on the fixed/floating substrings - if it's @@ -341,7 +347,6 @@ struct RExC_state_t { #define REQUIRE_UTF8(flagp) STMT_START { \ if (!UTF) { \ - assert(PASS1); \ *flagp = RESTART_PARSE|NEED_UTF8; \ return 0; \ } \ @@ -354,16 +359,26 @@ struct RExC_state_t { #define REQUIRE_UNI_RULES(flagp, restart_retval) \ STMT_START { \ if (DEPENDS_SEMANTICS) { \ - assert(PASS1); \ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ RExC_uni_semantics = 1; \ - if (RExC_seen_unfolded_sharp_s) { \ *flagp |= RESTART_PARSE; \ return restart_retval; \ - } \ } \ } STMT_END +#define BRANCH_MAX_OFFSET U16_MAX +#define REQUIRE_BRANCHJ(flagp, restart_retval) \ + STMT_START { \ + RExC_use_BRANCHJ = 1; \ + *flagp |= RESTART_PARSE; \ + return restart_retval; \ + } STMT_END + +#define REQUIRE_PARENS_PASS \ + STMT_START { \ + if (RExC_total_parens == 0) RExC_total_parens = -1; \ + } STMT_END + /* Executes a return statement with the value 'X', if 'flags' contains any of * 'RESTART_PARSE', 'NEED_UTF8', or 'extra'. If so, *flagp is set to those * flags */ @@ -671,7 +686,7 @@ static const scan_data_t zero_scan_data = { #define REPORT_LOCATION_ARGS(xC) \ UTF8fARG(UTF, \ (xI(xC) > eI) /* Don't run off end */ \ - ? eC - sC /* Length before the <--HERE */ \ + ? eI - sI /* Length before the <--HERE */ \ : ((xI_offset(xC) >= 0) \ ? xI_offset(xC) \ : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \ @@ -693,6 +708,10 @@ static const scan_data_t zero_scan_data = { STMT_START { \ if (RExC_rx_sv) \ SAVEFREESV(RExC_rx_sv); \ + if (RExC_open_parens) \ + SAVEFREEPV(RExC_open_parens); \ + if (RExC_close_parens) \ + SAVEFREEPV(RExC_close_parens); \ } STMT_END /* @@ -800,17 +819,22 @@ static const scan_data_t zero_scan_data = { #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL #define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp -/* Outputting warnings is generally deferred until the 2nd pass. This is - * because the first pass can be restarted, for example if the pattern has to - * be converted to UTF-8. If a warning had already been output earlier in the - * pass, it would be re-output after the restart. Pass 2 is never restarted, - * so the problem simply goes away if we defer the output to that pass. See - * [perl #122671]. 'RExC_copy_start_in_constructed' being NULL is a flag to - * not generate any warnings */ +/* Since a warning can be generated multiple times as the input is reparsed, we + * output it the first time we come to that point in the parse, but suppress it + * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not + * generate any warnings */ #define TO_OUTPUT_WARNINGS(loc) \ - (PASS2 && RExC_copy_start_in_constructed) + ( RExC_copy_start_in_constructed \ + && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset) -#define UPDATE_WARNINGS_LOC(loc) NOOP +/* After we've emitted a warning, we save the position in the input so we don't + * output it again */ +#define UPDATE_WARNINGS_LOC(loc) \ + STMT_START { \ + if (TO_OUTPUT_WARNINGS(loc)) { \ + RExC_latest_warn_offset = (xI(loc)) - RExC_precomp; \ + } \ + } STMT_END /* 'warns' is the output of the packWARNx macro used in 'code' */ #define _WARN_HELPER(loc, warns, code) \ @@ -941,11 +965,11 @@ static const scan_data_t zero_scan_data = { #define Set_Node_Offset_Length(node,offset,len) #define ProgLen(ri) ri->u.proglen #define SetProgLen(ri,x) ri->u.proglen = x +#define Track_Code(code) #else #define ProgLen(ri) ri->u.offsets[0] #define SetProgLen(ri,x) ri->u.offsets[0] = x #define Set_Node_Offset_To_R(offset,byte) STMT_START { \ - if (! SIZE_ONLY) { \ MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ __LINE__, (int)(offset), (int)(byte))); \ if((offset) < 0) { \ @@ -954,7 +978,6 @@ static const scan_data_t zero_scan_data = { } else { \ RExC_offsets[2*(offset)-1] = (byte); \ } \ - } \ } STMT_END #define Set_Node_Offset(node,byte) \ @@ -962,7 +985,6 @@ static const scan_data_t zero_scan_data = { #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) #define Set_Node_Length_To_R(node,len) STMT_START { \ - if (! SIZE_ONLY) { \ MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ __LINE__, (int)(node), (int)(len))); \ if((node) < 0) { \ @@ -971,7 +993,6 @@ static const scan_data_t zero_scan_data = { } else { \ RExC_offsets[2*(node)] = (len); \ } \ - } \ } STMT_END #define Set_Node_Length(node,len) \ @@ -987,6 +1008,8 @@ static const scan_data_t zero_scan_data = { Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \ Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \ } STMT_END + +#define Track_Code(code) STMT_START { code } STMT_END #endif #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS @@ -3583,9 +3606,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, optimisation. */ while( optimize < jumper ) { -#ifdef RE_TRACK_PATTERN_OFFSETS - mjd_nodelen += Node_Length((optimize)); -#endif + Track_Code( mjd_nodelen += Node_Length((optimize)); ); OP( optimize ) = OPTIMIZED; Set_Node_Offset_Length(optimize, 0, 0); optimize++; @@ -6328,8 +6349,10 @@ S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs) return; for (n = 0; n < cbs->count; n++) { REGEXP *rx = cbs->cb[n].src_regex; - cbs->cb[n].src_regex = NULL; - SvREFCNT_dec(rx); + if (rx) { + cbs->cb[n].src_regex = NULL; + SvREFCNT_dec_NN(rx); + } } Safefree(cbs->cb); Safefree(cbs); @@ -6373,7 +6396,8 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, DEBUG_PARSE_r(Perl_re_printf( aTHX_ "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); - Newx(dst, *plen_p * 2 + 1, U8); + /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ + Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8); d = dst; while (s < *plen_p) { @@ -7068,7 +7092,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags) { REGEXP *Rx; /* Capital 'R' means points to a REGEXP */ - struct regexp *r; STRLEN plen; char *exp; regnode *scan; @@ -7232,7 +7255,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* ignore the utf8ness if the pattern is 0 length */ RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); - RExC_rx_sv = NULL; \ RExC_uni_semantics = 0; RExC_seen_unfolded_sharp_s = 0; RExC_contains_locale = 0; @@ -7243,7 +7265,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_frame_head= NULL; RExC_frame_last= NULL; RExC_frame_count= 0; + RExC_latest_warn_offset = 0; + RExC_use_BRANCHJ = 0; RExC_total_parens = 0; + RExC_open_parens = NULL; + RExC_close_parens = NULL; + RExC_paren_names = NULL; + RExC_size = 0; +#ifdef DEBUGGING + RExC_paren_name_list = NULL; +#endif DEBUG_r({ RExC_mysv1= sv_newmortal(); @@ -7287,6 +7318,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, return old_re; } + /* Allocate the pattern's SV */ + RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP); + RExC_rx = ReANY(Rx); + if ( RExC_rx == NULL ) + FAIL("Regexp out of space"); + rx_flags = orig_rx_flags; if ( initial_charset == REGEX_DEPENDS_CHARSET @@ -7298,8 +7335,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); } - RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp; - RExC_flags = rx_flags; RExC_pm_flags = pm_flags; if (runtime_code) { @@ -7323,43 +7358,82 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_maxlen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; - RExC_extralen = 0; #ifdef EBCDIC RExC_recode_x_to_native = 0; #endif RExC_in_multi_char_class = 0; - /* First pass: determine size, legality. */ - RExC_pass1 = TRUE; - RExC_parse = exp; - RExC_start = RExC_copy_start_in_constructed = exp; - RExC_end = exp + plen; - RExC_precomp_end = RExC_end; - RExC_naughty = 0; - RExC_npar = 1; + RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp; + RExC_precomp_end = RExC_end = exp + plen; RExC_nestroot = 0; - RExC_size = 0L; - RExC_emit = 1; RExC_whilem_seen = 0; - RExC_open_parens = 0; - RExC_close_parens = 0; RExC_end_op = NULL; - RExC_paren_names = NULL; -#ifdef DEBUGGING - RExC_paren_name_list = NULL; -#endif RExC_recurse = NULL; RExC_study_chunk_recursed = NULL; RExC_study_chunk_recursed_bytes= 0; RExC_recurse_count = 0; pRExC_state->code_index = 0; - /* We allocate scratch space as large as the largest node, for use in the - * first pass. Since many functions return RExC_emit on success, and '0' - * if an error, RExC_emit must never be 0, so we set it to 1 and double - * the scratch space */ - Newxc(RExC_emit_start, 2 * sizeof(regnode_ssc), char, regnode); - SAVEFREEPV(RExC_emit_start); + /* Initialize the string in the compiled pattern. This is so that there is + * something to output if necessary */ + set_regex_pv(pRExC_state, Rx); + + DEBUG_PARSE_r({ + Perl_re_printf( aTHX_ + "Starting parse and generation\n"); + RExC_lastnum=0; + RExC_lastparse=NULL; + }); + + /* Allocate space and zero-initialize. Note, the two step process + of zeroing when in debug mode, thus anything assigned has to + happen after that */ + if (! RExC_size) { + + /* On the first pass of the parse, we guess how big this will be. Then + * we grow in one operation to that amount and then give it back. As + * we go along, we re-allocate what we need. + * + * XXX Currently the guess is essentially that the pattern will be an + * EXACT node with one byte input, one byte output. This is crude, and + * better heuristics are welcome. + * + * On any subsequent passes, we guess what we actually computed in the + * latest earlier pass. Such a pass probably didn't complete so is + * missing stuff. We could improve those guesses by knowing where the + * parse stopped, and use the length so far plus apply the above + * assumption to what's left. */ + RExC_size = STR_SZ(RExC_end - RExC_start); + } + + Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal); + if ( RExC_rxi == NULL ) + FAIL("Regexp out of space"); + + Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char); + RXi_SET( RExC_rx, RExC_rxi ); + + /* We start from 0 (over from 0 in the case this is a reparse. The first + * node parsed will give back any excess memory we have allocated so far). + * */ + RExC_size = 0; + + /* non-zero initialization begins here */ + RExC_rx->engine= eng; + RExC_rx->extflags = rx_flags; + RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; + + if (pm_flags & PMf_IS_QR) { + RExC_rxi->code_blocks = pRExC_state->code_blocks; + if (RExC_rxi->code_blocks) { + RExC_rxi->code_blocks->refcnt++; + } + } + + RExC_rx->intflags = 0; + + RExC_flags = rx_flags; /* don't let top level (?i) bleed */ + RExC_parse = exp; /* This NUL is guaranteed because the pattern comes from an SV*, and the sv * code makes sure the final byte is an uncounted NUL. But should this @@ -7370,13 +7444,34 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * etc. So it is worth noting. */ assert(*RExC_end == '\0'); - DEBUG_PARSE_r( - Perl_re_printf( aTHX_ "Starting first pass (sizing)\n"); - RExC_lastnum=0; - RExC_lastparse=NULL; - ); + RExC_naughty = 0; + RExC_npar = 1; + RExC_emit_start = RExC_rxi->program; + pRExC_state->code_index = 0; + + *((char*) RExC_emit_start) = (char) REG_MAGIC; + RExC_emit = 1; + + /* Do the parse */ + if (reg(pRExC_state, 0, &flags, 1)) { + + /* Success!, But if RExC_total_parens < 0, we need to redo the parse + * knowing how many parens there actually are */ + if (RExC_total_parens < 0) { + flags |= RESTART_PARSE; + } + + /* We have that number in RExC_npar */ + RExC_total_parens = RExC_npar; + } + else if (! MUST_RESTART(flags)) { + ReREFCNT_dec(Rx); + Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags); + } + + /* Here, we either have success, or we have to redo the parse for some reason */ + if (MUST_RESTART(flags)) { - if (reg(pRExC_state, 0, &flags, 1) == 0) { /* It's possible to write a regexp in ascii that represents Unicode codepoints outside of the byte range, such as via \x{100}. If we detect such a sequence we have to convert the entire pattern to utf8 @@ -7385,142 +7480,91 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, at least some part of the pattern, and therefore must convert the whole thing. -- dmq */ - if (MUST_RESTART(flags)) { - if (flags & NEED_UTF8) { - S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, - pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0); - DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n")); - } - else { - DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n")); + if (flags & NEED_UTF8) { + + /* We have stored the offset of the final warning output so far. + * That must be adjusted. Any variant characters between the start + * of the pattern and this warning count for 2 bytes in the final, + * so just add them again */ + if (UNLIKELY(RExC_latest_warn_offset > 0)) { + RExC_latest_warn_offset += + variant_under_utf8_count((U8 *) exp, (U8 *) exp + + RExC_latest_warn_offset); } + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0); + DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n")); + } + else { + DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n")); + } - goto redo_parse; + if (RExC_total_parens > 0) { + /* Make enough room for all the known parens, and zero it */ + Renew(RExC_open_parens, RExC_total_parens, regnode_offset); + Zero(RExC_open_parens, RExC_total_parens, regnode_offset); + RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */ + + Renew(RExC_close_parens, RExC_total_parens, regnode_offset); + Zero(RExC_close_parens, RExC_total_parens, regnode_offset); + } + else { /* Parse did not complete. Reinitialize the parentheses + structures */ + RExC_total_parens = 0; + if (RExC_open_parens) { + Safefree(RExC_open_parens); + RExC_open_parens = NULL; + } + if (RExC_close_parens) { + Safefree(RExC_close_parens); + RExC_close_parens = NULL; + } } - Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile for sizing pass, flags=%#" UVxf, (UV) flags); - } - DEBUG_PARSE_r({ - Perl_re_printf( aTHX_ - "Required size %" IVdf " nodes\n" - "Starting second pass (creation)\n", - (IV)RExC_size); - RExC_lastnum=0; - RExC_lastparse=NULL; - }); + /* Clean up what we did in this parse */ + SvREFCNT_dec_NN(RExC_rx_sv); - /* The first pass could have found things that force Unicode semantics */ - if ((RExC_utf8 || RExC_uni_semantics) - && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET) - { - set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); + goto redo_parse; } - /* Small enough for pointer-storage convention? - If extralen==0, this means that we will not need long jumps. */ - if (RExC_size >= 0x10000L && RExC_extralen) - RExC_size += RExC_extralen; - else - RExC_extralen = 0; - if (RExC_whilem_seen > 15) - RExC_whilem_seen = 15; + /* Here, we have successfully parsed and generated the pattern's program + * for the regex engine. We are ready to finish things up and look for + * optimizations. */ - /* Allocate space and zero-initialize. Note, the two step process - of zeroing when in debug mode, thus anything assigned has to - happen after that */ - RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP); - RExC_rx = ReANY(Rx); - Newxc(RExC_rxi, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), - char, regexp_internal); - if ( RExC_rx == NULL || RExC_rxi == NULL ) - FAIL("Regexp out of space"); -#ifdef DEBUGGING - /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ - Zero(RExC_rxi, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), - char); -#else - /* bulk initialize base fields with 0. */ - Zero(RExC_rxi, sizeof(regexp_internal), char); -#endif - - /* non-zero initialization begins here */ - RXi_SET( RExC_rx, RExC_rxi ); - RExC_rx->engine= eng; - RExC_rx->extflags = rx_flags; - RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; + SetProgLen(RExC_rxi,RExC_size); - if (pm_flags & PMf_IS_QR) { - RExC_rxi->code_blocks = pRExC_state->code_blocks; - if (RExC_rxi->code_blocks) - RExC_rxi->code_blocks->refcnt++; + /* The values for the two variables below are now immutable, we can add + * them to the list to free without overwhelming it */ + if (RExC_open_parens) { + SAVEFREEPV(RExC_open_parens); + } + if (RExC_close_parens) { + SAVEFREEPV(RExC_close_parens); } - /* Set up the string to compile, with correct modifiers, etc */ + /* Update the string to compile, with correct modifiers, etc */ set_regex_pv(pRExC_state, Rx); - RExC_rx->intflags = 0; - RExC_total_parens = RExC_npar; - RExC_rx->nparens = RExC_total_parens - 1; /* set early to validate backrefs */ + RExC_rx->nparens = RExC_total_parens - 1; + + /* Uses the upper 4 bits of the FLAGS field, so keep within that size */ + if (RExC_whilem_seen > 15) + RExC_whilem_seen = 15; + + DEBUG_PARSE_r({ + Perl_re_printf( aTHX_ + "Required size %" IVdf " nodes\n", (IV)RExC_size); + RExC_lastnum=0; + RExC_lastparse=NULL; + }); - /* Useful during FAIL. */ #ifdef RE_TRACK_PATTERN_OFFSETS - Newxz(RExC_offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ DEBUG_OFFSETS_r(Perl_re_printf( aTHX_ "%s %" UVuf " bytes for offset annotations.\n", RExC_offsets ? "Got" : "Couldn't get", (UV)((2*RExC_size+1) * sizeof(U32)))); #endif - SetProgLen(RExC_rxi, RExC_size); - RExC_rx_sv = Rx; - - /* Second pass: emit code. */ - RExC_pass1 = FALSE; - RExC_flags = rx_flags; /* don't let top level (?i) bleed */ - RExC_pm_flags = pm_flags; - RExC_parse = exp; - RExC_end = exp + plen; - RExC_naughty = 0; - RExC_emit_start = RExC_rxi->program; - RExC_emit = 1; - RExC_emit_bound = RExC_rxi->program + RExC_size + 1; - pRExC_state->code_index = 0; - - *((char*) RExC_emit_start) = (char) REG_MAGIC; - /* setup various meta data about recursion, this all requires - * RExC_npar to be correctly set, and a bit later on we clear it */ - if (RExC_seen & REG_RECURSE_SEEN) { - DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ - "%*s%*s Setting up open/close parens\n", - 22, "| |", (int)(0 * 2 + 1), "")); - - /* setup RExC_open_parens, which holds the address of each - * OPEN tag, and to make things simpler for the 0 index - * the start of the program - this is used later for offsets */ - Newxz(RExC_open_parens, RExC_npar, regnode_offset); - SAVEFREEPV(RExC_open_parens); - RExC_open_parens[0] = RExC_emit; - /* setup RExC_close_parens, which holds the address of each - * CLOSE tag, and to make things simpler for the 0 index - * the end of the program - this is used later for offsets */ - Newxz(RExC_close_parens, RExC_npar, regnode_offset); - SAVEFREEPV(RExC_close_parens); - /* we dont know where end op starts yet, so we dont - * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */ - - /* Note, RExC_npar is 1 + the number of parens in a pattern. - * So its 1 if there are no parens. */ - RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + - ((RExC_npar & 0x07) != 0); - Newx(RExC_study_chunk_recursed, - RExC_study_chunk_recursed_bytes * RExC_npar, U8); - SAVEFREEPV(RExC_study_chunk_recursed); - } - RExC_npar = 1; - if (reg(pRExC_state, 0, &flags, 1) == 0) { - ReREFCNT_dec(Rx); - Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile for generation pass, flags=%#" UVxf, (UV) flags); - } DEBUG_OPTIMISE_r( Perl_re_printf( aTHX_ "Starting post parse optimization\n"); ); @@ -7533,6 +7577,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SAVEFREEPV(RExC_recurse); } + if (RExC_seen & REG_RECURSE_SEEN) { + /* Note, RExC_total_parens is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) + + ((RExC_total_parens & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_total_parens, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } + reStudy: RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; DEBUG_r( @@ -8522,31 +8576,38 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) character */ vFAIL("Group name must start with a non-digit word character"); } - if ( flags ) { sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), SVs_TEMP | (UTF ? SVf_UTF8 : 0)); - if ( flags == REG_RSN_RETURN_NAME) - return sv_name; - else if (flags==REG_RSN_RETURN_DATA) { - HE *he_str = NULL; - SV *sv_dat = NULL; - if ( ! sv_name ) /* should not happen*/ - Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); - if (RExC_paren_names) - he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); - if ( he_str ) - sv_dat = HeVAL(he_str); - if ( ! sv_dat ) /* Didn't find group */ + if ( flags == REG_RSN_RETURN_NAME) + return sv_name; + else if (flags==REG_RSN_RETURN_DATA) { + HE *he_str = NULL; + SV *sv_dat = NULL; + if ( ! sv_name ) /* should not happen*/ + Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); + if (RExC_paren_names) + he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) { /* Didn't find group */ + + /* It might be a forward reference; we can't fail until we + * know, by completing the parse to get all the groups, and + * then reparsing */ + if (RExC_total_parens > 0) { vFAIL("Reference to nonexistent named group"); - return sv_dat; - } - else { - Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", - (unsigned long) flags); + } + else { + REQUIRE_PARENS_PASS; + } } - NOT_REACHED; /* NOTREACHED */ + return sv_dat; } - return NULL; + else { + Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", + (unsigned long) flags); + } + NOT_REACHED; /* NOTREACHED */ } #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ @@ -8566,10 +8627,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) } else \ Perl_re_printf( aTHX_ "%16s",""); \ \ - if (SIZE_ONLY) \ - num = RExC_size + 1; \ - else \ - num=REG_NODE_NUM(REGNODE_p(RExC_emit)); \ + num=REG_NODE_NUM(REGNODE_p(RExC_emit)); \ if (RExC_lastnum!=num) \ Perl_re_printf( aTHX_ "|%4d", num); \ else \ @@ -10614,7 +10672,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) NOT_REACHED; /*NOTREACHED*/ case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ - if (PASS2 && ckWARN(WARN_REGEXP)) { + if (ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; @@ -10634,7 +10692,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) break; case CONTINUE_PAT_MOD: /* 'c' */ - if (PASS2 && ckWARN(WARN_REGEXP)) { + if (ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ @@ -10721,9 +10779,7 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, regnode_offset ret; char* name_start = RExC_parse; U32 num = 0; - SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY - ? REG_RSN_RETURN_NULL - : REG_RSN_RETURN_DATA); + SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF; @@ -11109,20 +11165,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) ret = reg2Lanode(pRExC_state, op, 0, internal_argval); } RExC_seen |= REG_VERBARG_SEEN; - if ( ! SIZE_ONLY ) { - if (start_arg) { - SV *sv = newSVpvn( start_arg, - RExC_parse - start_arg); - ARG(REGNODE_p(ret)) = add_data( pRExC_state, - STR_WITH_LEN("S")); - RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv; - FLAGS(REGNODE_p(ret)) = 1; - } else { - FLAGS(REGNODE_p(ret)) = 0; - } - if ( internal_argval != -1 ) - ARG2L_SET(REGNODE_p(ret), internal_argval); + if (start_arg) { + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(REGNODE_p(ret)) = add_data( pRExC_state, + STR_WITH_LEN("S")); + RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv; + FLAGS(REGNODE_p(ret)) = 1; + } else { + FLAGS(REGNODE_p(ret)) = 0; } + if ( internal_argval != -1 ) + ARG2L_SET(REGNODE_p(ret), internal_argval); nextchar(pRExC_state); return ret; } @@ -11183,10 +11237,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /* FALLTHROUGH */ case '\'': /* (?'...') */ name_start = RExC_parse; - svname = reg_scan_name(pRExC_state, - SIZE_ONLY /* reverse test from the others */ - ? REG_RSN_RETURN_NAME - : REG_RSN_RETURN_NULL); + svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME); if ( RExC_parse == name_start || RExC_parse >= RExC_end || *RExC_parse != paren) @@ -11194,7 +11245,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); } - if (SIZE_ONLY) { + { HE *he_str; SV *sv_dat = NULL; if (!svname) /* shouldn't happen */ @@ -11286,6 +11337,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) buffers in alternations share the same numbers */ paren = ':'; after_freeze = freeze_paren = RExC_npar; + + /* XXX This construct currently requires an extra pass. + * Investigation would be required to see if that could be + * changed */ + REQUIRE_PARENS_PASS; break; case ':': /* (?:...) */ case '>': /* (?>...) */ @@ -11300,6 +11356,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) FAIL("Sequence (?R) not terminated"); num = 0; RExC_seen |= REG_RECURSE_SEEN; + + /* XXX These constructs currently require an extra pass. + * It probably could be changed */ + REQUIRE_PARENS_PASS; + *flagp |= POSTPONED; goto gen_recurse_regop; /*notreached*/ @@ -11308,9 +11369,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) parse_start = RExC_parse - 1; named_recursion: { - SV *sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); - num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + SV *sv_dat = reg_scan_name(pRExC_state, + REG_RSN_RETURN_DATA); + num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } if (RExC_parse >= RExC_end || *RExC_parse != ')') vFAIL("Sequence (?&... not terminated"); @@ -11372,8 +11433,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) */ num = RExC_npar + num; if (num < 1) { - RExC_parse++; - vFAIL("Reference to nonexistent group"); + + /* It might be a forward reference; we can't fail until + * we know, by completing the parse to get all the + * groups, and then reparsing */ + if (RExC_total_parens > 0) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + else { + REQUIRE_PARENS_PASS; + } } } else if ( paren == '+' ) { num = RExC_npar + num - 1; @@ -11388,18 +11458,27 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) */ ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count); - if (!SIZE_ONLY) { - if (num > (I32)RExC_rx->nparens) { - RExC_parse++; - vFAIL("Reference to nonexistent group"); - } - RExC_recurse_count++; - DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ - "%*s%*s Recurse #%" UVuf " to %" IVdf "\n", - 22, "| |", (int)(depth * 2 + 1), "", - (UV)ARG(REGNODE_p(ret)), - (IV)ARG2L(REGNODE_p(ret)))); + if (num >= RExC_npar) { + + /* It might be a forward reference; we can't fail until we + * know, by completing the parse to get all the groups, and + * then reparsing */ + if (RExC_total_parens > 0) { + if (num >= RExC_total_parens) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + } + else { + REQUIRE_PARENS_PASS; + } } + RExC_recurse_count++; + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ + "%*s%*s Recurse #%" UVuf " to %" IVdf "\n", + 22, "| |", (int)(depth * 2 + 1), "", + (UV)ARG(REGNODE_p(ret)), + (IV)ARG2L(REGNODE_p(ret)))); RExC_seen |= REG_RECURSE_SEEN; Set_Node_Length(REGNODE_p(ret), @@ -11449,20 +11528,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /* this is a pre-compiled code block (?{...}) */ cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index]; RExC_parse = RExC_start + cb->end; - if (!SIZE_ONLY) { o = cb->block; - if (cb->src_regex) { - n = add_data(pRExC_state, STR_WITH_LEN("rl")); - RExC_rxi->data->data[n] = - (void*)SvREFCNT_inc((SV*)cb->src_regex); - RExC_rxi->data->data[n+1] = (void*)o; - } - else { - n = add_data(pRExC_state, - (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); - RExC_rxi->data->data[n] = (void*)o; - } - } + if (cb->src_regex) { + n = add_data(pRExC_state, STR_WITH_LEN("rl")); + RExC_rxi->data->data[n] = + (void*)SvREFCNT_inc((SV*)cb->src_regex); + RExC_rxi->data->data[n+1] = (void*)o; + } + else { + n = add_data(pRExC_state, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); + RExC_rxi->data->data[n] = (void*)o; + } pRExC_state->code_index++; nextchar(pRExC_state); @@ -11477,9 +11554,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) * return value */ RExC_flags & RXf_PMf_COMPILETIME ); - if (!SIZE_ONLY) { - FLAGS(REGNODE_p(ret)) = 2; - } + FLAGS(REGNODE_p(ret)) = 2; REGTAIL(pRExC_state, ret, eval); /* deal with the length of this later - MJD */ return ret; @@ -11529,8 +11604,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) regnode_offset tail; ret = reg_node(pRExC_state, LOGICAL); - if (!SIZE_ONLY) - FLAGS(REGNODE_p(ret)) = 1; + FLAGS(REGNODE_p(ret)) = 1; tail = reg(pRExC_state, 1, &flag, depth+1); RETURN_FAIL_ON_RESTART(flag, flagp); @@ -11543,8 +11617,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) char ch = RExC_parse[0] == '<' ? '>' : '\''; char *name_start= RExC_parse++; U32 num = 0; - SV *sv_dat=reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA); if ( RExC_parse == name_start || RExC_parse >= RExC_end || *RExC_parse != ch) @@ -11595,20 +11668,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) SV *sv_dat; RExC_parse++; sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY - ? REG_RSN_RETURN_NULL - : REG_RSN_RETURN_DATA); - - /* we should only have a false sv_dat when - * SIZE_ONLY is true, and we always have false - * sv_dat when SIZE_ONLY is true. - * reg_scan_name() will VFAIL() if the name is - * unknown when SIZE_ONLY is false, and otherwise - * will return something, and when SIZE_ONLY is - * true, reg_scan_name() just parses the string, - * and doesnt return anything. (in theory) */ - assert(SIZE_ONLY ? !sv_dat : !!sv_dat); - + REG_RSN_RETURN_DATA); if (sv_dat) parno = 1 + *((I32 *)SvPVX(sv_dat)); } @@ -11688,9 +11748,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } else REGTAIL(pRExC_state, ret, ender); +#if 0 /* Removing this doesn't cause failures in the test suite -- khw */ RExC_size++; /* XXX WHY do we need this?!! For large programs it seems to be required but I can't figure out why. -- dmq*/ +#endif return ret; } RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; @@ -11732,20 +11794,48 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) capturing_parens: parno = RExC_npar; RExC_npar++; + if (RExC_total_parens <= 0) { + /* If we are in our first pass through (and maybe only pass), + * we need to allocate memory for the capturing parentheses + * data structures. Since we start at npar=1, when it reaches + * 2, for the first time it has something to put in it. Above + * 2 means we extend what we already have */ + if (RExC_npar == 2) { + /* setup RExC_open_parens, which holds the address of each + * OPEN tag, and to make things simpler for the 0 index the + * start of the program - this is used later for offsets */ + Newxz(RExC_open_parens, RExC_npar, regnode_offset); + RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */ + + /* setup RExC_close_parens, which holds the address of each + * CLOSE tag, and to make things simpler for the 0 index + * the end of the program - this is used later for offsets + * */ + Newxz(RExC_close_parens, RExC_npar, regnode_offset); + /* we dont know where end op starts yet, so we dont need to + * set RExC_close_parens[0] like we do RExC_open_parens[0] + * above */ + } + else { + Renew(RExC_open_parens, RExC_npar, regnode_offset); + Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset); + + Renew(RExC_close_parens, RExC_npar, regnode_offset); + Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset); + } + } ret = reganode(pRExC_state, OPEN, parno); - if (!SIZE_ONLY ){ - if (!RExC_nestroot) - RExC_nestroot = parno; - if (RExC_open_parens && !RExC_open_parens[parno]) - { - DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ - "%*s%*s Setting open paren #%" IVdf " to %d\n", - 22, "| |", (int)(depth * 2 + 1), "", - (IV)parno, REG_NODE_NUM(REGNODE_p(ret)))); - RExC_open_parens[parno]= ret; - } - } + if (!RExC_nestroot) + RExC_nestroot = parno; + if (RExC_open_parens && !RExC_open_parens[parno]) + { + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ + "%*s%*s Setting open paren #%" IVdf " to %d\n", + 22, "| |", (int)(depth * 2 + 1), "", + (IV)parno, REG_NODE_NUM(REGNODE_p(ret)))); + RExC_open_parens[parno]= ret; + } Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */ @@ -11781,8 +11871,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) Set_Node_Offset_To_R(br, parse_start-RExC_start); } have_branch = 1; - if (SIZE_ONLY) - RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ } else if (paren == ':') { *flagp |= flags&SIMPLE; @@ -11803,8 +11891,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))), ender); } - if (SIZE_ONLY) - RExC_extralen += 2; /* Account for LONGJMP. */ nextchar(pRExC_state); if (freeze_paren) { if (RExC_npar > after_freeze) @@ -11864,21 +11950,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) break; case 0: ender = reg_node(pRExC_state, END); - if (!SIZE_ONLY) { - assert(!RExC_end_op); /* there can only be one! */ - RExC_end_op = REGNODE_p(ender); - if (RExC_close_parens) { - DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ - "%*s%*s Setting close paren #0 (END) to %d\n", - 22, "| |", (int)(depth * 2 + 1), "", - REG_NODE_NUM(REGNODE_p(ender)))); + assert(!RExC_end_op); /* there can only be one! */ + RExC_end_op = REGNODE_p(ender); + if (RExC_close_parens) { + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ + "%*s%*s Setting close paren #0 (END) to %d\n", + 22, "| |", (int)(depth * 2 + 1), "", + REG_NODE_NUM(REGNODE_p(ender)))); - RExC_close_parens[0]= ender; - } + RExC_close_parens[0]= ender; } break; } - DEBUG_PARSE_r(if (!SIZE_ONLY) { + DEBUG_PARSE_r( DEBUG_PARSE_MSG("lsbr"); regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state); regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state); @@ -11889,10 +11973,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) (IV)REG_NODE_NUM(REGNODE_p(ender)), (IV)(ender - lastbr) ); - }); + ); REGTAIL(pRExC_state, lastbr, ender); - if (have_branch && !SIZE_ONLY) { + if (have_branch) { char is_nothing= 1; if (depth==1) RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; @@ -11924,7 +12008,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) br= PL_regkind[OP(ret_as_regnode)] != BRANCH ? regnext(ret_as_regnode) : ret_as_regnode; - DEBUG_PARSE_r(if (!SIZE_ONLY) { + DEBUG_PARSE_r( DEBUG_PARSE_MSG("NADA"); regprop(RExC_rx, RExC_mysv1, ret_as_regnode, NULL, pRExC_state); @@ -11937,7 +12021,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) (IV)REG_NODE_NUM(REGNODE_p(ender)), (IV)(ender - ret) ); - }); + ); OP(br)= NOTHING; if (OP(REGNODE_p(ender)) == TAIL) { NEXT_OFF(br)= 0; @@ -12042,9 +12126,6 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) } } - if (!first && SIZE_ONLY) - RExC_extralen += 1; /* BRANCHJ */ - *flagp = WORST; /* Tentatively. */ skip_to_be_ignored_text(pRExC_state, &RExC_parse, @@ -12067,6 +12148,11 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) /* FIXME adding one for every branch after the first is probably * excessive now we have TRIE support. (hv) */ MARK_NAUGHTY(1); + if ( chain > (SSize_t) BRANCH_MAX_OFFSET + && ! RExC_use_BRANCHJ) + { + REQUIRE_BRANCHJ(flagp, 0); + } REGTAIL(pRExC_state, chain, latest); } chain = latest; @@ -12183,11 +12269,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (max < min) { /* If can't match, warn and optimize to fail unconditionally */ reginsert(pRExC_state, OPFAIL, orig_emit, depth+1); - if (PASS2) { - ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); - NEXT_OFF(REGNODE_p(orig_emit)) = - regarglen[OPFAIL] + NODE_STEP_REGNODE; - } + ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); + NEXT_OFF(REGNODE_p(orig_emit)) = + regarglen[OPFAIL] + NODE_STEP_REGNODE; return ret; } else if (min == max && *RExC_parse == '?') @@ -12236,8 +12320,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to LONGJMP. */ REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); - if (SIZE_ONLY) - RExC_whilem_seen++, RExC_extralen += 3; + RExC_whilem_seen++; MARK_NAUGHTY_EXP(1, 4); /* compound interest */ } FLAGS(REGNODE_p(ret)) = 0; @@ -12246,10 +12329,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = WORST; if (max > 0) *flagp |= HASWIDTH; - if (!SIZE_ONLY) { - ARG1_SET(REGNODE_p(ret), (U16)min); - ARG2_SET(REGNODE_p(ret), (U16)max); - } + ARG1_SET(REGNODE_p(ret), (U16)min); + ARG2_SET(REGNODE_p(ret), (U16)max); if (max == REG_INFTY) RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; @@ -12298,7 +12379,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto do_curly; } nest_check: - if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { + if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { ckWARN2reg(RExC_parse, "%" UTF8f " matches null string many times", UTF8fARG(UTF, (RExC_parse >= origparse @@ -12744,22 +12825,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; - /* Don't bother to check for downgrading in PASS1, as it doesn't make any - * sizing difference, and is extra work that is thrown away */ - if (downgradable && ! PASS2) { - downgradable = FALSE; - } - if (! len_passed_in) { if (UTF) { if (UVCHR_IS_INVARIANT(code_point)) { if (LOC || ! FOLD) { /* /l defers folding until runtime */ *character = (U8) code_point; } - else { /* Here is /i and not /l. (toFOLD() is defined on just - ASCII, which isn't the same thing as INVARIANT on - EBCDIC, but it works there, as the extra invariants - fold to themselves) */ + else { /* Here is /i and not /l. */ *character = toFOLD((U8) code_point); /* We can downgrade to an EXACT node if this character @@ -12848,15 +12920,14 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, } } - if (SIZE_ONLY) { - RExC_size += STR_SZ(len); + if (downgradable) { + change_engine_size(pRExC_state, STR_SZ(len)); } - else { - RExC_emit += STR_SZ(len); - STR_LEN(REGNODE_p(node)) = len; - if (! len_passed_in) { - Copy((char *) character, STRING(REGNODE_p(node)), len, char); - } + + RExC_emit += STR_SZ(len); + STR_LEN(REGNODE_p(node)) = len; + if (! len_passed_in) { + Copy((char *) character, STRING(REGNODE_p(node)), len, char); } *flagp |= HASWIDTH; @@ -12874,8 +12945,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, *flagp |= SIMPLE; } - /* The OP may not be well defined in PASS1 */ - if (PASS2 && OP(REGNODE_p(node)) == EXACTFL) { + if (OP(REGNODE_p(node)) == EXACTFL) { RExC_contains_locale = 1; } } @@ -13140,8 +13210,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* SBOL is shared with /^/ so we set the flags so we can tell * /\A/ from /^/ in split. We check ret because first pass we * have no regop struct to set the flags on. */ - if (PASS2) - FLAGS(REGNODE_p(ret)) = 1; + FLAGS(REGNODE_p(ret)) = 1; *flagp |= SIMPLE; goto finish_meta_pat; case 'G': @@ -13202,7 +13271,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp |= SIMPLE; if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') { FLAGS(REGNODE_p(ret)) = TRADITIONAL_BOUND; - if (PASS2 && op > BOUNDA) { /* /aa is same as /a */ + if (op > BOUNDA) { /* /aa is same as /a */ OP(REGNODE_p(ret)) = BOUNDA; } } @@ -13270,7 +13339,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_parse = endbrace; REQUIRE_UNI_RULES(flagp, 0); - if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */ + if (op >= BOUNDA) { /* /aa is same as /a */ OP(REGNODE_p(ret)) = BOUNDU; length += 4; @@ -13286,7 +13355,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } - if (PASS2 && invert) { + if (invert) { OP(REGNODE_p(ret)) += NBOUND - BOUND; } goto finish_meta_pat; @@ -13351,9 +13420,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } ret = reg_node(pRExC_state, op); - if (! SIZE_ONLY) { - FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg); - } + FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg); *flagp |= HASWIDTH|SIMPLE; /* FALLTHROUGH */ @@ -13537,9 +13604,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) vFAIL("Unterminated \\g{...} pattern"); RExC_parse++; } - if (!SIZE_ONLY) { - if (num > (I32)RExC_rx->nparens) - vFAIL("Reference to nonexistent group"); + if (num >= (I32)RExC_npar) { + + /* It might be a forward reference; we can't fail until we + * know, by completing the parse to get all the groups, and + * then reparsing */ + if (RExC_total_parens > 0) { + if (num >= RExC_total_parens) { + vFAIL("Reference to nonexistent group"); + } + } + else { + REQUIRE_PARENS_PASS; + } } RExC_sawback = 1; ret = reganode(pRExC_state, @@ -13605,7 +13682,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* This allows us to fill a node with just enough spare so that if the final * character folds, its expansion is guaranteed to fit */ #define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE) - char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE+1]; char *s0; U8 upper_parse = MAX_NODE_STRING_SIZE; @@ -13621,6 +13697,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * created for the new category. */ U8 node_type = EXACT; + /* Assume node will be fully used; the excess is given back at the end */ + Ptrdiff_t initial_size = STR_SZ(256); + bool next_is_quantifier; char * oldp = NULL; @@ -13634,19 +13713,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * which don't participate in folds with Latin1-range characters, * as the latter's folds aren't known until runtime. (We don't * need to figure this out until pass 2) */ - bool maybe_exactfu = PASS2; + bool maybe_exactfu = TRUE; /* To see if RExC_uni_semantics changes during parsing of the node. * */ bool uni_semantics_at_node_start; - /* The node_type may change below, but since the size of the node - * doesn't change, it works */ - ret = reg_node(pRExC_state, node_type); + /* Allocate an EXACT node. The node_type may change below to + * another EXACTish node, but since the size of the node doesn't + * change, it works */ + ret = regnode_guts(pRExC_state, node_type, initial_size, "exact"); + FILL_NODE(ret, node_type); + RExC_emit++; - /* In pass1, folded, we use a temporary buffer instead of the - * actual node, as the node doesn't exist yet */ - s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(REGNODE_p(ret)); + s = STRING(REGNODE_p(ret)); s0 = s; @@ -14001,19 +14081,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (! FOLD) { /* The simple case, just append the literal */ - /* In the sizing pass, we need only the size of the - * character we are appending, hence we can delay getting - * its representation until PASS2. */ - if (SIZE_ONLY) { - if (UTF && ! UVCHR_IS_INVARIANT(ender)) { - const STRLEN unilen = UVCHR_SKIP(ender); - s += unilen; - added_len = unilen; - } - else { - s++; - } - } else { /* PASS2 */ not_fold_common: if (UTF && ! UVCHR_IS_INVARIANT(ender)) { U8 * new_s = uvchr_to_utf8((U8*)s, ender); @@ -14023,7 +14090,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else { *(s++) = (char) ender; } - } } else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) { @@ -14402,6 +14468,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) loopdone: /* Jumped to when encounters something that shouldn't be in the node */ + change_engine_size(pRExC_state, - (initial_size - STR_SZ(len))); + /* I (khw) don't know if you can get here with zero length, but the * old code handled this situation by creating a zero-length EXACT * node. Might as well be NOTHING instead */ @@ -14458,7 +14526,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* Position parse to next real character */ skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE /* Don't force to /x */ ); - if ( PASS2 && *RExC_parse == '{' + if ( *RExC_parse == '{' && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse)) { if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) { @@ -15425,7 +15493,6 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, const bool save_fold = FOLD; /* Temporary */ char *save_end, *save_parse; /* Temporaries */ const bool in_locale = LOC; /* we turn off /l during processing */ - AV* posix_warnings = NULL; GET_RE_DEBUG_FLAGS_DECL; @@ -15441,117 +15508,6 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * compile time values are valid in all runtime cases */ REQUIRE_UNI_RULES(flagp, 0); - /* This will return only an ANYOF regnode, or (unlikely) something smaller - * (such as EXACT). Thus we can skip most everything if just sizing. We - * call regclass to handle '[]' so as to not have to reinvent its parsing - * rules here (throwing away the size it computes each time). And, we exit - * upon an unescaped ']' that isn't one ending a regclass. To do both - * these things, we need to realize that something preceded by a backslash - * is escaped, so we have to keep track of backslashes */ - if (SIZE_ONLY) { - UV nest_depth = 0; /* how many nested (?[...]) constructs */ - - while (RExC_parse < RExC_end) { - SV* current = NULL; - - skip_to_be_ignored_text(pRExC_state, &RExC_parse, - TRUE /* Force /x */ ); - - switch (*RExC_parse) { - case '(': - if (RExC_parse[1] == '?' && RExC_parse[2] == '[') - nest_depth++, RExC_parse+=2; - /* FALLTHROUGH */ - default: - break; - case '\\': - /* Skip past this, so the next character gets skipped, after - * the switch */ - RExC_parse++; - if (*RExC_parse == 'c') { - /* Skip the \cX notation for control characters */ - RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; - } - break; - - case '[': - { - /* See if this is a [:posix:] class. */ - bool is_posix_class = (OOB_NAMEDCLASS - < handle_possible_posix(pRExC_state, - RExC_parse + 1, - NULL, - NULL, - TRUE /* checking only */)); - /* If it is a posix class, leave the parse pointer at the - * '[' to fool regclass() into thinking it is part of a - * '[[:posix:]]'. */ - if (! is_posix_class) { - RExC_parse++; - } - - /* regclass() can only return RESTART_PARSE and NEED_UTF8 - * if multi-char folds are allowed. */ - if (!regclass(pRExC_state, flagp, depth+1, - is_posix_class, /* parse the whole char - class only if not a - posix class */ - FALSE, /* don't allow multi-char folds */ - TRUE, /* silence non-portable warnings. */ - TRUE, /* strict */ - FALSE, /* Require return to be an ANYOF */ - ¤t, - &posix_warnings - )) - FAIL2("panic: regclass returned failure to handle_sets, " - "flags=%#" UVxf, (UV) *flagp); - - /* function call leaves parse pointing to the ']', except - * if we faked it */ - if (is_posix_class) { - RExC_parse--; - } - - SvREFCNT_dec(current); /* In case it returned something */ - break; - } - - case ']': - if (RExC_parse[1] == ')') { - RExC_parse++; - if (nest_depth--) break; - node = reganode(pRExC_state, ANYOF, 0); - nextchar(pRExC_state); - Set_Node_Length(REGNODE_p(node), - RExC_parse - oregcomp_parse + 1); /* MJD */ - if (in_locale) { - set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); - } - - return node; - } - /* We output the messages even if warnings are off, because we'll fail - * the very next thing, and these give a likely diagnosis for that */ - if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) { - output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL); - } - RExC_parse++; - vFAIL("Unexpected ']' with no following ')' in (?[..."); - } - - RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; - } - - /* We output the messages even if warnings are off, because we'll fail - * the very next thing, and these give a likely diagnosis for that */ - if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) { - output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL); - } - - vFAIL("Syntax error in (?[...])"); - } - - /* Pass 2 only after this. */ ckWARNexperimental(RExC_parse, WARN_EXPERIMENTAL__REGEX_SETS, "The regex_sets feature is experimental"); @@ -15644,8 +15600,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, skip_to_be_ignored_text(pRExC_state, &RExC_parse, TRUE /* Force /x */ ); - if (RExC_parse >= RExC_end) { - Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); + if (RExC_parse >= RExC_end) { /* Fail */ + break; } curchar = UCHARAT(RExC_parse); @@ -15820,6 +15776,10 @@ redo_curchar: "flags=%#" UVxf, (UV) *flagp); } + if (! current) { + break; + } + /* function call leaves parse pointing to the ']', except if we * faked it */ if (is_posix_class) { @@ -15839,6 +15799,9 @@ redo_curchar: case ')': if (av_tindex_skip_len_mg(fence_stack) < 0) { + if (UCHARAT(RExC_parse - 1) == ']') { + break; + } RExC_parse++; vFAIL("Unexpected ')'"); } @@ -16025,6 +15988,9 @@ redo_curchar: default: RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + if (RExC_parse >= RExC_end) { + break; + } vFAIL("Unexpected character"); handle_operand: @@ -16084,7 +16050,18 @@ redo_curchar: RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; } /* End of loop parsing through the construct */ + vFAIL("Syntax error in (?[...])"); + done: + + if (RExC_parse >= RExC_end || RExC_parse[1] != ')') { + if (RExC_parse < RExC_end) { + RExC_parse++; + } + + vFAIL("Unexpected ']' with no following ')' in (?[..."); + } + if (av_tindex_skip_len_mg(fence_stack) >= 0) { vFAIL("Unmatched ("); } @@ -16368,8 +16345,10 @@ S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_war PREPARE_TO_DIE; } } - Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", + if (TO_OUTPUT_WARNINGS(RExC_parse)) { + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg)); + } SvREFCNT_dec_NN(msg); } } @@ -16576,7 +16555,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const regnode_offset orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; - const SSize_t orig_size = RExC_size; bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ /* This variable is used to mark where the end in the input is of something @@ -16587,8 +16565,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, char *not_posix_region_end = RExC_parse - 1; AV* posix_warnings = NULL; - const bool do_posix_warnings = return_posix_warnings - || (PASS2 && ckWARN(WARN_REGEXP)); + const bool do_posix_warnings = return_posix_warnings || ckWARN(WARN_REGEXP); U8 op = END; /* The returned node-type, initialized to an impossible one. */ U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */ @@ -16610,14 +16587,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, allow_multi_folds = FALSE; #endif - if (SIZE_ONLY) { - listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ - } - else { - listsv = newSVpvs_flags("# comment\n", SVs_TEMP); - initial_listsv_len = SvCUR(listsv); - SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ - } + listsv = newSVpvs_flags("# comment\n", SVs_TEMP); + initial_listsv_len = SvCUR(listsv); + SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); @@ -16638,7 +16610,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ¬_posix_region_end, NULL, TRUE /* checking only */); - if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) { + if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) { ckWARN4reg(not_posix_region_end, "POSIX syntax [%c %c] belongs inside character classes%s", *RExC_parse, *RExC_parse, @@ -16920,7 +16892,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, e = RExC_parse; n = 1; } - if (!SIZE_ONLY) { + { char* name = RExC_parse; char* base_name; /* name after any packages are stripped */ char* lookup_name = NULL; @@ -17111,7 +17083,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } } - } /* End of actually getting the values in pass 2 */ + } RExC_parse = e + 1; namedclass = ANYOF_UNIPROP; /* no official name, but it's @@ -17182,8 +17154,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; vFAIL("Need exactly 3 octal digits"); } - else if (! SIZE_ONLY /* like \08, \178 */ - && numlen < 3 + else if ( numlen < 3 /* like \08, \178 */ && RExC_parse < RExC_end && isDIGIT(*RExC_parse) && ckWARN(WARN_REGEXP)) @@ -17198,7 +17169,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } default: /* Allow \_ to not give an error */ - if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') { + if (isWORDCHAR(value) && value != '_') { if (strict) { vFAIL2("Unrecognized escape \\%c in character class", (int)value); @@ -17222,24 +17193,22 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * literal, as is the character that began the false range, i.e. * the 'a' in the examples */ if (range) { - if (!SIZE_ONLY) { - const int w = (RExC_parse >= rangebegin) - ? RExC_parse - rangebegin - : 0; - if (strict) { - vFAIL2utf8f( - "False [] range \"%" UTF8f "\"", - UTF8fARG(UTF, w, rangebegin)); - } - else { - ckWARN2reg(RExC_parse, - "False [] range \"%" UTF8f "\"", - UTF8fARG(UTF, w, rangebegin)); - cp_list = add_cp_to_invlist(cp_list, '-'); - cp_foldable_list = add_cp_to_invlist(cp_foldable_list, - prevvalue); - } - } + const int w = (RExC_parse >= rangebegin) + ? RExC_parse - rangebegin + : 0; + if (strict) { + vFAIL2utf8f( + "False [] range \"%" UTF8f "\"", + UTF8fARG(UTF, w, rangebegin)); + } + else { + ckWARN2reg(RExC_parse, + "False [] range \"%" UTF8f "\"", + UTF8fARG(UTF, w, rangebegin)); + cp_list = add_cp_to_invlist(cp_list, '-'); + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, + prevvalue); + } range = 0; /* this was not a true range */ element_count += 2; /* So counts for three values */ @@ -17296,32 +17265,30 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* The above-Latin1 characters are not subject to locale rules. * Just add them, in the second pass, to the * unconditionally-matched list */ - if (! SIZE_ONLY) { - - /* Get the list of the above-Latin1 code points this - * matches */ - _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - - /* Odd numbers are complements, like - * NDIGIT, NASCII, ... */ - namedclass % 2 != 0, - &scratch_list); - /* Checking if 'cp_list' is NULL first saves an extra - * clone. Its reference count will be decremented at the - * next union, etc, or if this is the only instance, at the - * end of the routine */ - if (! cp_list) { - cp_list = scratch_list; - } - else { - _invlist_union(cp_list, scratch_list, &cp_list); - SvREFCNT_dec_NN(scratch_list); - } - continue; /* Go get next character */ + + /* Get the list of the above-Latin1 code points this + * matches */ + _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + + /* Odd numbers are complements, like + * NDIGIT, NASCII, ... */ + namedclass % 2 != 0, + &scratch_list); + /* Checking if 'cp_list' is NULL first saves an extra + * clone. Its reference count will be decremented at the + * next union, etc, or if this is the only instance, at the + * end of the routine */ + if (! cp_list) { + cp_list = scratch_list; } + else { + _invlist_union(cp_list, scratch_list, &cp_list); + SvREFCNT_dec_NN(scratch_list); + } + continue; /* Go get next character */ } - else if (! SIZE_ONLY) { + else { /* Here, not in pass1 (in that pass we skip calculating the * contents of this class), and is not /l, or is a POSIX class @@ -17439,7 +17406,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* a bad range like \w-, [:word:]- ? */ if (namedclass > OOB_NAMEDCLASS) { - if (strict || (PASS2 && ckWARN(WARN_REGEXP))) { + if (strict || ckWARN(WARN_REGEXP)) { const int w = RExC_parse >= rangebegin ? RExC_parse - rangebegin : 0; @@ -17453,9 +17420,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, w, w, rangebegin); } } - if (!SIZE_ONLY) { - cp_list = add_cp_to_invlist(cp_list, '-'); - } + cp_list = add_cp_to_invlist(cp_list, '-'); element_count++; } else range = 1; /* yeah, it's a range! */ @@ -17541,7 +17506,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } - if (strict && PASS2 && ckWARN(WARN_REGEXP)) { + if (strict && ckWARN(WARN_REGEXP)) { if (range) { /* If the range starts above 255, everything is portable and @@ -17673,48 +17638,45 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } /* Deal with this element of the class */ - if (! SIZE_ONLY) { #ifndef EBCDIC - cp_foldable_list = _add_range_to_invlist(cp_foldable_list, - prevvalue, value); + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); #else - /* On non-ASCII platforms, for ranges that span all of 0..255, and - * ones that don't require special handling, we can just add the - * range like we do for ASCII platforms */ - if ((UNLIKELY(prevvalue == 0) && value >= 255) - || ! (prevvalue < 256 - && (unicode_range - || (! non_portable_endpoint - && ((isLOWER_A(prevvalue) && isLOWER_A(value)) - || (isUPPER_A(prevvalue) - && isUPPER_A(value))))))) - { - cp_foldable_list = _add_range_to_invlist(cp_foldable_list, - prevvalue, value); + /* On non-ASCII platforms, for ranges that span all of 0..255, and ones + * that don't require special handling, we can just add the range like + * we do for ASCII platforms */ + if ((UNLIKELY(prevvalue == 0) && value >= 255) + || ! (prevvalue < 256 + && (unicode_range + || (! non_portable_endpoint + && ((isLOWER_A(prevvalue) && isLOWER_A(value)) + || (isUPPER_A(prevvalue) + && isUPPER_A(value))))))) + { + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); + } + else { + /* Here, requires special handling. This can be because it is a + * range whose code points are considered to be Unicode, and so + * must be individually translated into native, or because its a + * subrange of 'A-Z' or 'a-z' which each aren't contiguous in + * EBCDIC, but we have defined them to include only the "expected" + * upper or lower case ASCII alphabetics. Subranges above 255 are + * the same in native and Unicode, so can be added as a range */ + U8 start = NATIVE_TO_LATIN1(prevvalue); + unsigned j; + U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255; + for (j = start; j <= end; j++) { + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j)); } - else { - /* Here, requires special handling. This can be because it is - * a range whose code points are considered to be Unicode, and - * so must be individually translated into native, or because - * its a subrange of 'A-Z' or 'a-z' which each aren't - * contiguous in EBCDIC, but we have defined them to include - * only the "expected" upper or lower case ASCII alphabetics. - * Subranges above 255 are the same in native and Unicode, so - * can be added as a range */ - U8 start = NATIVE_TO_LATIN1(prevvalue); - unsigned j; - U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255; - for (j = start; j <= end; j++) { - cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j)); - } - if (value > 255) { - cp_foldable_list = _add_range_to_invlist(cp_foldable_list, - 256, value); - } + if (value > 255) { + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + 256, value); } -#endif } +#endif range = 0; /* this range (if it was one) is done now */ } /* End of loop through all the text within the brackets */ @@ -17990,21 +17952,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * the parse */ const char * cur_parse = RExC_parse; RExC_parse = (char *)orig_parse; - if (PL_regkind[op] == POSIXD) { - if (op == POSIXL) { - RExC_contains_locale = 1; - } - if (invert) { - op += NPOSIXD - POSIXD; - } + if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } + if (invert) { + op += NPOSIXD - POSIXD; } + } ret = reg_node(pRExC_state, op); if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { - if (! SIZE_ONLY) { - FLAGS(REGNODE_p(ret)) = arg; - } + FLAGS(REGNODE_p(ret)) = arg; *flagp |= HASWIDTH|SIMPLE; } else if (PL_regkind[op] == EXACT) { @@ -18027,20 +17987,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } - /* Assume we are going to generate an ANYOF-type node. */ - op = (posixl) - ? ANYOFPOSIXL - : (LOC) - ? ANYOFL - : ANYOF; - ret = reganode(pRExC_state, op, 0); - - if (SIZE_ONLY) { - return ret; - } - - /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/ - /* If folding, we calculate all characters that could fold to or from the * ones already on the list */ if (cp_foldable_list) { @@ -18422,19 +18368,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } if (ret_invlist) { - assert(cp_list); - *ret_invlist = cp_list; SvREFCNT_dec(swash); - /* Discard the generated node */ - if (SIZE_ONLY) { - RExC_size = orig_size; - } - else { - RExC_emit = orig_emit; - } - return orig_emit; + return RExC_emit; } /* Some character classes are equivalent to other nodes. Such nodes take @@ -18727,13 +18664,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } /* It's going to be an ANYOF node. */ - OP(REGNODE_p(ret)) = (use_anyofd) + op = (use_anyofd) ? ANYOFD : ((posixl) ? ANYOFPOSIXL : ((LOC) ? ANYOFL : ANYOF)); + ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof"); + FILL_NODE(ret, op); /* We set the argument later */ + RExC_emit += 1 + regarglen[op]; ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags; /* Here, contains all the code points we can determine at @@ -19233,6 +19173,28 @@ S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size) PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE; RExC_size += size; + + Renewc(RExC_rxi, + sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode), + /* +1 for REG_MAGIC */ + char, + regexp_internal); + if ( RExC_rxi == NULL ) + FAIL("Regexp out of space"); + RXi_SET(RExC_rx, RExC_rxi); + + RExC_emit_start = RExC_rxi->program; + if (size > 0) { + Zero(REGNODE_p(RExC_emit), size, regnode); + } + +#ifdef RE_TRACK_PATTERN_OFFSETS + Renew(RExC_offsets, 2*RExC_size+1, U32); + if (size > 0) { + Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32); + } + RExC_offsets[0] = 2*RExC_size+1; +#endif } STATIC regnode_offset @@ -19250,21 +19212,15 @@ S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_ PERL_ARGS_ASSERT_REGNODE_GUTS; - assert(extra_size >= regarglen[op]); - - if (SIZE_ONLY) { - SIZE_ALIGN(RExC_size); + SIZE_ALIGN(RExC_size); change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size); - return(ret); - } - if (REGNODE_p(RExC_emit) >= RExC_emit_bound) - Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", - op, (void*)REGNODE_p(RExC_emit), (void*)RExC_emit_bound); - NODE_ALIGN_FILL(REGNODE_p(ret)); #ifndef RE_TRACK_PATTERN_OFFSETS PERL_UNUSED_ARG(name); + PERL_UNUSED_ARG(op); #else + assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF); + if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG( ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n", @@ -19288,16 +19244,14 @@ STATIC regnode_offset /* Location. */ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) { const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node"); + regnode_offset ptr = ret; PERL_ARGS_ASSERT_REG_NODE; assert(regarglen[op] == 0); - if (PASS2) { - regnode_offset ptr = ret; - FILL_ADVANCE_NODE(ptr, op); - RExC_emit = ptr; - } + FILL_ADVANCE_NODE(ptr, op); + RExC_emit = ptr; return(ret); } @@ -19308,17 +19262,15 @@ STATIC regnode_offset /* Location. */ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) { const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode"); + regnode_offset ptr = ret; PERL_ARGS_ASSERT_REGANODE; /* ANYOF are special cased to allow non-length 1 args */ - assert(regarglen[op] == 1 || PL_regkind[op] == ANYOF); + assert(regarglen[op] == 1); - if (PASS2) { - regnode_offset ptr = ret; - FILL_ADVANCE_NODE_ARG(ptr, op, arg); - RExC_emit = ptr; - } + FILL_ADVANCE_NODE_ARG(ptr, op, arg); + RExC_emit = ptr; return(ret); } @@ -19328,16 +19280,14 @@ S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const /* emit a node with U32 and I32 arguments */ const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode"); + regnode_offset ptr = ret; PERL_ARGS_ASSERT_REG2LANODE; assert(regarglen[op] == 2); - if (PASS2) { - regnode_offset ptr = ret; - FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2); - RExC_emit = ptr; - } + FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2); + RExC_emit = ptr; return(ret); } @@ -19350,9 +19300,8 @@ S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const * IMPORTANT NOTE - it is the *callers* responsibility to correctly * set up NEXT_OFF() of the inserted node if needed. Something like this: * -* reginsert(pRExC, OPFAIL, orig_emit, depth+1); -* if (PASS2) -* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE; +* reginsert(pRExC, OPFAIL, orig_emit, depth+1); +* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE; * * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well. */ @@ -19372,13 +19321,10 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]); - if (SIZE_ONLY) { - RExC_size += size; - return; - } assert(!RExC_study_started); /* I believe we should never use reginsert once we have started studying. If this is wrong then we need to adjust RExC_recurse below like we do with RExC_open_parens/RExC_close_parens. */ + change_engine_size(pRExC_state, (Ptrdiff_t) size); src = REGNODE_p(RExC_emit); RExC_emit += size; dst = REGNODE_p(RExC_emit); @@ -19472,9 +19418,6 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state, PERL_UNUSED_ARG(depth); #endif - if (SIZE_ONLY) - return; - /* Find last node. */ scan = (regnode_offset) p; for (;;) { @@ -19532,9 +19475,6 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, PERL_ARGS_ASSERT_REGTAIL_STUDY; - if (SIZE_ONLY) - return exact; - /* Find last node. */ scan = p; @@ -20275,6 +20215,9 @@ Perl_pregfree2(pTHX_ REGEXP *rx) PERL_ARGS_ASSERT_PREGFREE2; + if (! r) + return; + if (r->mother_re) { ReREFCNT_dec(r->mother_re); } else { @@ -20420,6 +20363,10 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) PERL_ARGS_ASSERT_REGFREE_INTERNAL; + if (! ri) { + return; + } + DEBUG_COMPILE_r({ if (!PL_colorset) reginitcolors();