corresponding to copy_start */
SSize_t whilem_seen; /* number of WHILEM in this expr */
regnode *emit_start; /* Start of emitted-code area */
- regnode *emit_bound; /* First regnode outside of the
- allocated space */
regnode_offset emit; /* Code-emit pointer */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
SSize_t size; /* Number of regnode equivalents in
pattern */
- I32 npar; /* Capture buffer count, (OPEN) plus
- one. ("par" 0 is the whole
- pattern)*/
- I32 total_par; /* Capture buffer count after parse
- completed, (OPEN) plus one. ("par" 0
- is the whole pattern)*/
+
+ /* position beyond 'precomp' of the warning message furthest away from
+ * 'precomp'. During the parse, no warnings are raised for any problems
+ * earlier in the parse than this position. This works if warnings are
+ * raised the first time a given spot is parsed, and if only one
+ * independent warning is raised for any given spot */
+ Size_t latest_warn_offset;
+
+ I32 npar; /* Capture buffer count so far in the
+ parse, (OPEN) plus one. ("par" 0 is
+ the whole pattern)*/
+ I32 total_par; /* During initial parse, is either 0,
+ or -1; the latter indicating a
+ reparse is needed. After that pass,
+ it is what 'npar' became after the
+ pass. Hence, it being > 0 indicates
+ we are in a reparse situation */
I32 nestroot; /* root parens we are in - used by
accept */
- I32 extralen;
I32 seen_zerolen;
regnode_offset *open_parens; /* offsets to open parens */
regnode_offset *close_parens; /* offsets to close parens */
bool strict;
bool study_started;
bool in_script_run;
- bool pass1;
+ bool use_BRANCHJ;
};
#define RExC_flags (pRExC_state->flags)
#define RExC_start (pRExC_state->start)
#define RExC_end (pRExC_state->end)
#define RExC_parse (pRExC_state->parse)
+#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
#define RExC_whilem_seen (pRExC_state->whilem_seen)
/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
others */
#endif
#define RExC_emit (pRExC_state->emit)
-#define RExC_pass1 (pRExC_state->pass1)
#define RExC_emit_start (pRExC_state->emit_start)
-#define RExC_emit_bound (pRExC_state->emit_bound)
#define RExC_sawback (pRExC_state->sawback)
#define RExC_seen (pRExC_state->seen)
#define RExC_size (pRExC_state->size)
#define RExC_npar (pRExC_state->npar)
#define RExC_total_parens (pRExC_state->total_par)
#define RExC_nestroot (pRExC_state->nestroot)
-#define RExC_extralen (pRExC_state->extralen)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_utf8 (pRExC_state->utf8)
#define RExC_uni_semantics (pRExC_state->uni_semantics)
#define RExC_study_started (pRExC_state->study_started)
#define RExC_warn_text (pRExC_state->warn_text)
#define RExC_in_script_run (pRExC_state->in_script_run)
-#define RExC_use_BRANCHJ (!SIZE_ONLY && RExC_extralen)
+#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
* a flag to disable back-off on the fixed/floating substrings - if it's
#define REQUIRE_UTF8(flagp) STMT_START { \
if (!UTF) { \
- assert(PASS1); \
*flagp = RESTART_PARSE|NEED_UTF8; \
return 0; \
} \
#define REQUIRE_UNI_RULES(flagp, restart_retval) \
STMT_START { \
if (DEPENDS_SEMANTICS) { \
- assert(PASS1); \
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
RExC_uni_semantics = 1; \
- if (RExC_seen_unfolded_sharp_s) { \
*flagp |= RESTART_PARSE; \
return restart_retval; \
- } \
} \
} STMT_END
+#define BRANCH_MAX_OFFSET U16_MAX
+#define REQUIRE_BRANCHJ(flagp, restart_retval) \
+ STMT_START { \
+ RExC_use_BRANCHJ = 1; \
+ *flagp |= RESTART_PARSE; \
+ return restart_retval; \
+ } STMT_END
+
+#define REQUIRE_PARENS_PASS \
+ STMT_START { \
+ if (RExC_total_parens == 0) RExC_total_parens = -1; \
+ } STMT_END
+
/* Executes a return statement with the value 'X', if 'flags' contains any of
* 'RESTART_PARSE', 'NEED_UTF8', or 'extra'. If so, *flagp is set to those
* flags */
#define REPORT_LOCATION_ARGS(xC) \
UTF8fARG(UTF, \
(xI(xC) > eI) /* Don't run off end */ \
- ? eC - sC /* Length before the <--HERE */ \
+ ? eI - sI /* Length before the <--HERE */ \
: ((xI_offset(xC) >= 0) \
? xI_offset(xC) \
: (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
STMT_START { \
if (RExC_rx_sv) \
SAVEFREESV(RExC_rx_sv); \
+ if (RExC_open_parens) \
+ SAVEFREEPV(RExC_open_parens); \
+ if (RExC_close_parens) \
+ SAVEFREEPV(RExC_close_parens); \
} STMT_END
/*
#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
#define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
-/* Outputting warnings is generally deferred until the 2nd pass. This is
- * because the first pass can be restarted, for example if the pattern has to
- * be converted to UTF-8. If a warning had already been output earlier in the
- * pass, it would be re-output after the restart. Pass 2 is never restarted,
- * so the problem simply goes away if we defer the output to that pass. See
- * [perl #122671]. 'RExC_copy_start_in_constructed' being NULL is a flag to
- * not generate any warnings */
+/* Since a warning can be generated multiple times as the input is reparsed, we
+ * output it the first time we come to that point in the parse, but suppress it
+ * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
+ * generate any warnings */
#define TO_OUTPUT_WARNINGS(loc) \
- (PASS2 && RExC_copy_start_in_constructed)
+ ( RExC_copy_start_in_constructed \
+ && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
-#define UPDATE_WARNINGS_LOC(loc) NOOP
+/* After we've emitted a warning, we save the position in the input so we don't
+ * output it again */
+#define UPDATE_WARNINGS_LOC(loc) \
+ STMT_START { \
+ if (TO_OUTPUT_WARNINGS(loc)) { \
+ RExC_latest_warn_offset = (xI(loc)) - RExC_precomp; \
+ } \
+ } STMT_END
/* 'warns' is the output of the packWARNx macro used in 'code' */
#define _WARN_HELPER(loc, warns, code) \
#define Set_Node_Offset_Length(node,offset,len)
#define ProgLen(ri) ri->u.proglen
#define SetProgLen(ri,x) ri->u.proglen = x
+#define Track_Code(code)
#else
#define ProgLen(ri) ri->u.offsets[0]
#define SetProgLen(ri,x) ri->u.offsets[0] = x
#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
- if (! SIZE_ONLY) { \
MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
__LINE__, (int)(offset), (int)(byte))); \
if((offset) < 0) { \
} else { \
RExC_offsets[2*(offset)-1] = (byte); \
} \
- } \
} STMT_END
#define Set_Node_Offset(node,byte) \
#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
#define Set_Node_Length_To_R(node,len) STMT_START { \
- if (! SIZE_ONLY) { \
MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
__LINE__, (int)(node), (int)(len))); \
if((node) < 0) { \
} else { \
RExC_offsets[2*(node)] = (len); \
} \
- } \
} STMT_END
#define Set_Node_Length(node,len) \
Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
} STMT_END
+
+#define Track_Code(code) STMT_START { code } STMT_END
#endif
#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
optimisation.
*/
while( optimize < jumper ) {
-#ifdef RE_TRACK_PATTERN_OFFSETS
- mjd_nodelen += Node_Length((optimize));
-#endif
+ Track_Code( mjd_nodelen += Node_Length((optimize)); );
OP( optimize ) = OPTIMIZED;
Set_Node_Offset_Length(optimize, 0, 0);
optimize++;
return;
for (n = 0; n < cbs->count; n++) {
REGEXP *rx = cbs->cb[n].src_regex;
- cbs->cb[n].src_regex = NULL;
- SvREFCNT_dec(rx);
+ if (rx) {
+ cbs->cb[n].src_regex = NULL;
+ SvREFCNT_dec_NN(rx);
+ }
}
Safefree(cbs->cb);
Safefree(cbs);
DEBUG_PARSE_r(Perl_re_printf( aTHX_
"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
- Newx(dst, *plen_p * 2 + 1, U8);
+ /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
+ Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
d = dst;
while (s < *plen_p) {
bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
{
REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
- struct regexp *r;
STRLEN plen;
char *exp;
regnode *scan;
/* ignore the utf8ness if the pattern is 0 length */
RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
- RExC_rx_sv = NULL; \
RExC_uni_semantics = 0;
RExC_seen_unfolded_sharp_s = 0;
RExC_contains_locale = 0;
RExC_frame_head= NULL;
RExC_frame_last= NULL;
RExC_frame_count= 0;
+ RExC_latest_warn_offset = 0;
+ RExC_use_BRANCHJ = 0;
RExC_total_parens = 0;
+ RExC_open_parens = NULL;
+ RExC_close_parens = NULL;
+ RExC_paren_names = NULL;
+ RExC_size = 0;
+#ifdef DEBUGGING
+ RExC_paren_name_list = NULL;
+#endif
DEBUG_r({
RExC_mysv1= sv_newmortal();
return old_re;
}
+ /* Allocate the pattern's SV */
+ RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
+ RExC_rx = ReANY(Rx);
+ if ( RExC_rx == NULL )
+ FAIL("Regexp out of space");
+
rx_flags = orig_rx_flags;
if ( initial_charset == REGEX_DEPENDS_CHARSET
set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
}
- RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
- RExC_flags = rx_flags;
RExC_pm_flags = pm_flags;
if (runtime_code) {
RExC_maxlen = 0;
RExC_in_lookbehind = 0;
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
- RExC_extralen = 0;
#ifdef EBCDIC
RExC_recode_x_to_native = 0;
#endif
RExC_in_multi_char_class = 0;
- /* First pass: determine size, legality. */
- RExC_pass1 = TRUE;
- RExC_parse = exp;
- RExC_start = RExC_copy_start_in_constructed = exp;
- RExC_end = exp + plen;
- RExC_precomp_end = RExC_end;
- RExC_naughty = 0;
- RExC_npar = 1;
+ RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
+ RExC_precomp_end = RExC_end = exp + plen;
RExC_nestroot = 0;
- RExC_size = 0L;
- RExC_emit = 1;
RExC_whilem_seen = 0;
- RExC_open_parens = 0;
- RExC_close_parens = 0;
RExC_end_op = NULL;
- RExC_paren_names = NULL;
-#ifdef DEBUGGING
- RExC_paren_name_list = NULL;
-#endif
RExC_recurse = NULL;
RExC_study_chunk_recursed = NULL;
RExC_study_chunk_recursed_bytes= 0;
RExC_recurse_count = 0;
pRExC_state->code_index = 0;
- /* We allocate scratch space as large as the largest node, for use in the
- * first pass. Since many functions return RExC_emit on success, and '0'
- * if an error, RExC_emit must never be 0, so we set it to 1 and double
- * the scratch space */
- Newxc(RExC_emit_start, 2 * sizeof(regnode_ssc), char, regnode);
- SAVEFREEPV(RExC_emit_start);
+ /* Initialize the string in the compiled pattern. This is so that there is
+ * something to output if necessary */
+ set_regex_pv(pRExC_state, Rx);
+
+ DEBUG_PARSE_r({
+ Perl_re_printf( aTHX_
+ "Starting parse and generation\n");
+ RExC_lastnum=0;
+ RExC_lastparse=NULL;
+ });
+
+ /* Allocate space and zero-initialize. Note, the two step process
+ of zeroing when in debug mode, thus anything assigned has to
+ happen after that */
+ if (! RExC_size) {
+
+ /* On the first pass of the parse, we guess how big this will be. Then
+ * we grow in one operation to that amount and then give it back. As
+ * we go along, we re-allocate what we need.
+ *
+ * XXX Currently the guess is essentially that the pattern will be an
+ * EXACT node with one byte input, one byte output. This is crude, and
+ * better heuristics are welcome.
+ *
+ * On any subsequent passes, we guess what we actually computed in the
+ * latest earlier pass. Such a pass probably didn't complete so is
+ * missing stuff. We could improve those guesses by knowing where the
+ * parse stopped, and use the length so far plus apply the above
+ * assumption to what's left. */
+ RExC_size = STR_SZ(RExC_end - RExC_start);
+ }
+
+ Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
+ if ( RExC_rxi == NULL )
+ FAIL("Regexp out of space");
+
+ Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
+ RXi_SET( RExC_rx, RExC_rxi );
+
+ /* We start from 0 (over from 0 in the case this is a reparse. The first
+ * node parsed will give back any excess memory we have allocated so far).
+ * */
+ RExC_size = 0;
+
+ /* non-zero initialization begins here */
+ RExC_rx->engine= eng;
+ RExC_rx->extflags = rx_flags;
+ RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
+
+ if (pm_flags & PMf_IS_QR) {
+ RExC_rxi->code_blocks = pRExC_state->code_blocks;
+ if (RExC_rxi->code_blocks) {
+ RExC_rxi->code_blocks->refcnt++;
+ }
+ }
+
+ RExC_rx->intflags = 0;
+
+ RExC_flags = rx_flags; /* don't let top level (?i) bleed */
+ RExC_parse = exp;
/* This NUL is guaranteed because the pattern comes from an SV*, and the sv
* code makes sure the final byte is an uncounted NUL. But should this
* etc. So it is worth noting. */
assert(*RExC_end == '\0');
- DEBUG_PARSE_r(
- Perl_re_printf( aTHX_ "Starting first pass (sizing)\n");
- RExC_lastnum=0;
- RExC_lastparse=NULL;
- );
+ RExC_naughty = 0;
+ RExC_npar = 1;
+ RExC_emit_start = RExC_rxi->program;
+ pRExC_state->code_index = 0;
+
+ *((char*) RExC_emit_start) = (char) REG_MAGIC;
+ RExC_emit = 1;
+
+ /* Do the parse */
+ if (reg(pRExC_state, 0, &flags, 1)) {
+
+ /* Success!, But if RExC_total_parens < 0, we need to redo the parse
+ * knowing how many parens there actually are */
+ if (RExC_total_parens < 0) {
+ flags |= RESTART_PARSE;
+ }
+
+ /* We have that number in RExC_npar */
+ RExC_total_parens = RExC_npar;
+ }
+ else if (! MUST_RESTART(flags)) {
+ ReREFCNT_dec(Rx);
+ Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
+ }
+
+ /* Here, we either have success, or we have to redo the parse for some reason */
+ if (MUST_RESTART(flags)) {
- if (reg(pRExC_state, 0, &flags, 1) == 0) {
/* It's possible to write a regexp in ascii that represents Unicode
codepoints outside of the byte range, such as via \x{100}. If we
detect such a sequence we have to convert the entire pattern to utf8
at least some part of the pattern, and therefore must convert the whole
thing.
-- dmq */
- if (MUST_RESTART(flags)) {
- if (flags & NEED_UTF8) {
- S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
- pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
- DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
- }
- else {
- DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
+ if (flags & NEED_UTF8) {
+
+ /* We have stored the offset of the final warning output so far.
+ * That must be adjusted. Any variant characters between the start
+ * of the pattern and this warning count for 2 bytes in the final,
+ * so just add them again */
+ if (UNLIKELY(RExC_latest_warn_offset > 0)) {
+ RExC_latest_warn_offset +=
+ variant_under_utf8_count((U8 *) exp, (U8 *) exp
+ + RExC_latest_warn_offset);
}
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+ pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
+ DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
+ }
+ else {
+ DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
+ }
- goto redo_parse;
+ if (RExC_total_parens > 0) {
+ /* Make enough room for all the known parens, and zero it */
+ Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
+ Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
+ RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
+
+ Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
+ Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
+ }
+ else { /* Parse did not complete. Reinitialize the parentheses
+ structures */
+ RExC_total_parens = 0;
+ if (RExC_open_parens) {
+ Safefree(RExC_open_parens);
+ RExC_open_parens = NULL;
+ }
+ if (RExC_close_parens) {
+ Safefree(RExC_close_parens);
+ RExC_close_parens = NULL;
+ }
}
- Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile for sizing pass, flags=%#" UVxf, (UV) flags);
- }
- DEBUG_PARSE_r({
- Perl_re_printf( aTHX_
- "Required size %" IVdf " nodes\n"
- "Starting second pass (creation)\n",
- (IV)RExC_size);
- RExC_lastnum=0;
- RExC_lastparse=NULL;
- });
+ /* Clean up what we did in this parse */
+ SvREFCNT_dec_NN(RExC_rx_sv);
- /* The first pass could have found things that force Unicode semantics */
- if ((RExC_utf8 || RExC_uni_semantics)
- && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
- {
- set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
+ goto redo_parse;
}
- /* Small enough for pointer-storage convention?
- If extralen==0, this means that we will not need long jumps. */
- if (RExC_size >= 0x10000L && RExC_extralen)
- RExC_size += RExC_extralen;
- else
- RExC_extralen = 0;
- if (RExC_whilem_seen > 15)
- RExC_whilem_seen = 15;
+ /* Here, we have successfully parsed and generated the pattern's program
+ * for the regex engine. We are ready to finish things up and look for
+ * optimizations. */
- /* Allocate space and zero-initialize. Note, the two step process
- of zeroing when in debug mode, thus anything assigned has to
- happen after that */
- RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
- RExC_rx = ReANY(Rx);
- Newxc(RExC_rxi, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
- char, regexp_internal);
- if ( RExC_rx == NULL || RExC_rxi == NULL )
- FAIL("Regexp out of space");
-#ifdef DEBUGGING
- /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
- Zero(RExC_rxi, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
- char);
-#else
- /* bulk initialize base fields with 0. */
- Zero(RExC_rxi, sizeof(regexp_internal), char);
-#endif
-
- /* non-zero initialization begins here */
- RXi_SET( RExC_rx, RExC_rxi );
- RExC_rx->engine= eng;
- RExC_rx->extflags = rx_flags;
- RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
+ SetProgLen(RExC_rxi,RExC_size);
- if (pm_flags & PMf_IS_QR) {
- RExC_rxi->code_blocks = pRExC_state->code_blocks;
- if (RExC_rxi->code_blocks)
- RExC_rxi->code_blocks->refcnt++;
+ /* The values for the two variables below are now immutable, we can add
+ * them to the list to free without overwhelming it */
+ if (RExC_open_parens) {
+ SAVEFREEPV(RExC_open_parens);
+ }
+ if (RExC_close_parens) {
+ SAVEFREEPV(RExC_close_parens);
}
- /* Set up the string to compile, with correct modifiers, etc */
+ /* Update the string to compile, with correct modifiers, etc */
set_regex_pv(pRExC_state, Rx);
- RExC_rx->intflags = 0;
- RExC_total_parens = RExC_npar;
- RExC_rx->nparens = RExC_total_parens - 1; /* set early to validate backrefs */
+ RExC_rx->nparens = RExC_total_parens - 1;
+
+ /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
+ if (RExC_whilem_seen > 15)
+ RExC_whilem_seen = 15;
+
+ DEBUG_PARSE_r({
+ Perl_re_printf( aTHX_
+ "Required size %" IVdf " nodes\n", (IV)RExC_size);
+ RExC_lastnum=0;
+ RExC_lastparse=NULL;
+ });
- /* Useful during FAIL. */
#ifdef RE_TRACK_PATTERN_OFFSETS
- Newxz(RExC_offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
"%s %" UVuf " bytes for offset annotations.\n",
RExC_offsets ? "Got" : "Couldn't get",
(UV)((2*RExC_size+1) * sizeof(U32))));
#endif
- SetProgLen(RExC_rxi, RExC_size);
- RExC_rx_sv = Rx;
-
- /* Second pass: emit code. */
- RExC_pass1 = FALSE;
- RExC_flags = rx_flags; /* don't let top level (?i) bleed */
- RExC_pm_flags = pm_flags;
- RExC_parse = exp;
- RExC_end = exp + plen;
- RExC_naughty = 0;
- RExC_emit_start = RExC_rxi->program;
- RExC_emit = 1;
- RExC_emit_bound = RExC_rxi->program + RExC_size + 1;
- pRExC_state->code_index = 0;
-
- *((char*) RExC_emit_start) = (char) REG_MAGIC;
- /* setup various meta data about recursion, this all requires
- * RExC_npar to be correctly set, and a bit later on we clear it */
- if (RExC_seen & REG_RECURSE_SEEN) {
- DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Setting up open/close parens\n",
- 22, "| |", (int)(0 * 2 + 1), ""));
-
- /* setup RExC_open_parens, which holds the address of each
- * OPEN tag, and to make things simpler for the 0 index
- * the start of the program - this is used later for offsets */
- Newxz(RExC_open_parens, RExC_npar, regnode_offset);
- SAVEFREEPV(RExC_open_parens);
- RExC_open_parens[0] = RExC_emit;
- /* setup RExC_close_parens, which holds the address of each
- * CLOSE tag, and to make things simpler for the 0 index
- * the end of the program - this is used later for offsets */
- Newxz(RExC_close_parens, RExC_npar, regnode_offset);
- SAVEFREEPV(RExC_close_parens);
- /* we dont know where end op starts yet, so we dont
- * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */
-
- /* Note, RExC_npar is 1 + the number of parens in a pattern.
- * So its 1 if there are no parens. */
- RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
- ((RExC_npar & 0x07) != 0);
- Newx(RExC_study_chunk_recursed,
- RExC_study_chunk_recursed_bytes * RExC_npar, U8);
- SAVEFREEPV(RExC_study_chunk_recursed);
- }
- RExC_npar = 1;
- if (reg(pRExC_state, 0, &flags, 1) == 0) {
- ReREFCNT_dec(Rx);
- Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile for generation pass, flags=%#" UVxf, (UV) flags);
- }
DEBUG_OPTIMISE_r(
Perl_re_printf( aTHX_ "Starting post parse optimization\n");
);
SAVEFREEPV(RExC_recurse);
}
+ if (RExC_seen & REG_RECURSE_SEEN) {
+ /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
+ * So its 1 if there are no parens. */
+ RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
+ ((RExC_total_parens & 0x07) != 0);
+ Newx(RExC_study_chunk_recursed,
+ RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
+ SAVEFREEPV(RExC_study_chunk_recursed);
+ }
+
reStudy:
RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
DEBUG_r(
character */
vFAIL("Group name must start with a non-digit word character");
}
- if ( flags ) {
sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
SVs_TEMP | (UTF ? SVf_UTF8 : 0));
- if ( flags == REG_RSN_RETURN_NAME)
- return sv_name;
- else if (flags==REG_RSN_RETURN_DATA) {
- HE *he_str = NULL;
- SV *sv_dat = NULL;
- if ( ! sv_name ) /* should not happen*/
- Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
- if (RExC_paren_names)
- he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
- if ( he_str )
- sv_dat = HeVAL(he_str);
- if ( ! sv_dat ) /* Didn't find group */
+ if ( flags == REG_RSN_RETURN_NAME)
+ return sv_name;
+ else if (flags==REG_RSN_RETURN_DATA) {
+ HE *he_str = NULL;
+ SV *sv_dat = NULL;
+ if ( ! sv_name ) /* should not happen*/
+ Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
+ if (RExC_paren_names)
+ he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
+ if ( he_str )
+ sv_dat = HeVAL(he_str);
+ if ( ! sv_dat ) { /* Didn't find group */
+
+ /* It might be a forward reference; we can't fail until we
+ * know, by completing the parse to get all the groups, and
+ * then reparsing */
+ if (RExC_total_parens > 0) {
vFAIL("Reference to nonexistent named group");
- return sv_dat;
- }
- else {
- Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
- (unsigned long) flags);
+ }
+ else {
+ REQUIRE_PARENS_PASS;
+ }
}
- NOT_REACHED; /* NOTREACHED */
+ return sv_dat;
}
- return NULL;
+ else {
+ Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
+ (unsigned long) flags);
+ }
+ NOT_REACHED; /* NOTREACHED */
}
#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
} else \
Perl_re_printf( aTHX_ "%16s",""); \
\
- if (SIZE_ONLY) \
- num = RExC_size + 1; \
- else \
- num=REG_NODE_NUM(REGNODE_p(RExC_emit)); \
+ num=REG_NODE_NUM(REGNODE_p(RExC_emit)); \
if (RExC_lastnum!=num) \
Perl_re_printf( aTHX_ "|%4d", num); \
else \
NOT_REACHED; /*NOTREACHED*/
case ONCE_PAT_MOD: /* 'o' */
case GLOBAL_PAT_MOD: /* 'g' */
- if (PASS2 && ckWARN(WARN_REGEXP)) {
+ if (ckWARN(WARN_REGEXP)) {
const I32 wflagbit = *RExC_parse == 'o'
? WASTED_O
: WASTED_G;
break;
case CONTINUE_PAT_MOD: /* 'c' */
- if (PASS2 && ckWARN(WARN_REGEXP)) {
+ if (ckWARN(WARN_REGEXP)) {
if (! (wastedflags & WASTED_C) ) {
wastedflags |= WASTED_GC;
/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
regnode_offset ret;
char* name_start = RExC_parse;
U32 num = 0;
- SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
- ? REG_RSN_RETURN_NULL
- : REG_RSN_RETURN_DATA);
+ SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
}
RExC_seen |= REG_VERBARG_SEEN;
- if ( ! SIZE_ONLY ) {
- if (start_arg) {
- SV *sv = newSVpvn( start_arg,
- RExC_parse - start_arg);
- ARG(REGNODE_p(ret)) = add_data( pRExC_state,
- STR_WITH_LEN("S"));
- RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
- FLAGS(REGNODE_p(ret)) = 1;
- } else {
- FLAGS(REGNODE_p(ret)) = 0;
- }
- if ( internal_argval != -1 )
- ARG2L_SET(REGNODE_p(ret), internal_argval);
+ if (start_arg) {
+ SV *sv = newSVpvn( start_arg,
+ RExC_parse - start_arg);
+ ARG(REGNODE_p(ret)) = add_data( pRExC_state,
+ STR_WITH_LEN("S"));
+ RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
+ FLAGS(REGNODE_p(ret)) = 1;
+ } else {
+ FLAGS(REGNODE_p(ret)) = 0;
}
+ if ( internal_argval != -1 )
+ ARG2L_SET(REGNODE_p(ret), internal_argval);
nextchar(pRExC_state);
return ret;
}
/* FALLTHROUGH */
case '\'': /* (?'...') */
name_start = RExC_parse;
- svname = reg_scan_name(pRExC_state,
- SIZE_ONLY /* reverse test from the others */
- ? REG_RSN_RETURN_NAME
- : REG_RSN_RETURN_NULL);
+ svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
if ( RExC_parse == name_start
|| RExC_parse >= RExC_end
|| *RExC_parse != paren)
vFAIL2("Sequence (?%c... not terminated",
paren=='>' ? '<' : paren);
}
- if (SIZE_ONLY) {
+ {
HE *he_str;
SV *sv_dat = NULL;
if (!svname) /* shouldn't happen */
buffers in alternations share the same numbers */
paren = ':';
after_freeze = freeze_paren = RExC_npar;
+
+ /* XXX This construct currently requires an extra pass.
+ * Investigation would be required to see if that could be
+ * changed */
+ REQUIRE_PARENS_PASS;
break;
case ':': /* (?:...) */
case '>': /* (?>...) */
FAIL("Sequence (?R) not terminated");
num = 0;
RExC_seen |= REG_RECURSE_SEEN;
+
+ /* XXX These constructs currently require an extra pass.
+ * It probably could be changed */
+ REQUIRE_PARENS_PASS;
+
*flagp |= POSTPONED;
goto gen_recurse_regop;
/*notreached*/
parse_start = RExC_parse - 1;
named_recursion:
{
- SV *sv_dat = reg_scan_name(pRExC_state,
- SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
- num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
+ SV *sv_dat = reg_scan_name(pRExC_state,
+ REG_RSN_RETURN_DATA);
+ num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
}
if (RExC_parse >= RExC_end || *RExC_parse != ')')
vFAIL("Sequence (?&... not terminated");
*/
num = RExC_npar + num;
if (num < 1) {
- RExC_parse++;
- vFAIL("Reference to nonexistent group");
+
+ /* It might be a forward reference; we can't fail until
+ * we know, by completing the parse to get all the
+ * groups, and then reparsing */
+ if (RExC_total_parens > 0) {
+ RExC_parse++;
+ vFAIL("Reference to nonexistent group");
+ }
+ else {
+ REQUIRE_PARENS_PASS;
+ }
}
} else if ( paren == '+' ) {
num = RExC_npar + num - 1;
*/
ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
- if (!SIZE_ONLY) {
- if (num > (I32)RExC_rx->nparens) {
- RExC_parse++;
- vFAIL("Reference to nonexistent group");
- }
- RExC_recurse_count++;
- DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
- 22, "| |", (int)(depth * 2 + 1), "",
- (UV)ARG(REGNODE_p(ret)),
- (IV)ARG2L(REGNODE_p(ret))));
+ if (num >= RExC_npar) {
+
+ /* It might be a forward reference; we can't fail until we
+ * know, by completing the parse to get all the groups, and
+ * then reparsing */
+ if (RExC_total_parens > 0) {
+ if (num >= RExC_total_parens) {
+ RExC_parse++;
+ vFAIL("Reference to nonexistent group");
+ }
+ }
+ else {
+ REQUIRE_PARENS_PASS;
+ }
}
+ RExC_recurse_count++;
+ DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
+ "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
+ 22, "| |", (int)(depth * 2 + 1), "",
+ (UV)ARG(REGNODE_p(ret)),
+ (IV)ARG2L(REGNODE_p(ret))));
RExC_seen |= REG_RECURSE_SEEN;
Set_Node_Length(REGNODE_p(ret),
/* this is a pre-compiled code block (?{...}) */
cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
RExC_parse = RExC_start + cb->end;
- if (!SIZE_ONLY) {
o = cb->block;
- if (cb->src_regex) {
- n = add_data(pRExC_state, STR_WITH_LEN("rl"));
- RExC_rxi->data->data[n] =
- (void*)SvREFCNT_inc((SV*)cb->src_regex);
- RExC_rxi->data->data[n+1] = (void*)o;
- }
- else {
- n = add_data(pRExC_state,
- (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
- RExC_rxi->data->data[n] = (void*)o;
- }
- }
+ if (cb->src_regex) {
+ n = add_data(pRExC_state, STR_WITH_LEN("rl"));
+ RExC_rxi->data->data[n] =
+ (void*)SvREFCNT_inc((SV*)cb->src_regex);
+ RExC_rxi->data->data[n+1] = (void*)o;
+ }
+ else {
+ n = add_data(pRExC_state,
+ (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
+ RExC_rxi->data->data[n] = (void*)o;
+ }
pRExC_state->code_index++;
nextchar(pRExC_state);
* return value */
RExC_flags & RXf_PMf_COMPILETIME
);
- if (!SIZE_ONLY) {
- FLAGS(REGNODE_p(ret)) = 2;
- }
+ FLAGS(REGNODE_p(ret)) = 2;
REGTAIL(pRExC_state, ret, eval);
/* deal with the length of this later - MJD */
return ret;
regnode_offset tail;
ret = reg_node(pRExC_state, LOGICAL);
- if (!SIZE_ONLY)
- FLAGS(REGNODE_p(ret)) = 1;
+ FLAGS(REGNODE_p(ret)) = 1;
tail = reg(pRExC_state, 1, &flag, depth+1);
RETURN_FAIL_ON_RESTART(flag, flagp);
char ch = RExC_parse[0] == '<' ? '>' : '\'';
char *name_start= RExC_parse++;
U32 num = 0;
- SV *sv_dat=reg_scan_name(pRExC_state,
- SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
+ SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
if ( RExC_parse == name_start
|| RExC_parse >= RExC_end
|| *RExC_parse != ch)
SV *sv_dat;
RExC_parse++;
sv_dat = reg_scan_name(pRExC_state,
- SIZE_ONLY
- ? REG_RSN_RETURN_NULL
- : REG_RSN_RETURN_DATA);
-
- /* we should only have a false sv_dat when
- * SIZE_ONLY is true, and we always have false
- * sv_dat when SIZE_ONLY is true.
- * reg_scan_name() will VFAIL() if the name is
- * unknown when SIZE_ONLY is false, and otherwise
- * will return something, and when SIZE_ONLY is
- * true, reg_scan_name() just parses the string,
- * and doesnt return anything. (in theory) */
- assert(SIZE_ONLY ? !sv_dat : !!sv_dat);
-
+ REG_RSN_RETURN_DATA);
if (sv_dat)
parno = 1 + *((I32 *)SvPVX(sv_dat));
}
}
else
REGTAIL(pRExC_state, ret, ender);
+#if 0 /* Removing this doesn't cause failures in the test suite -- khw */
RExC_size++; /* XXX WHY do we need this?!!
For large programs it seems to be required
but I can't figure out why. -- dmq*/
+#endif
return ret;
}
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
capturing_parens:
parno = RExC_npar;
RExC_npar++;
+ if (RExC_total_parens <= 0) {
+ /* If we are in our first pass through (and maybe only pass),
+ * we need to allocate memory for the capturing parentheses
+ * data structures. Since we start at npar=1, when it reaches
+ * 2, for the first time it has something to put in it. Above
+ * 2 means we extend what we already have */
+ if (RExC_npar == 2) {
+ /* setup RExC_open_parens, which holds the address of each
+ * OPEN tag, and to make things simpler for the 0 index the
+ * start of the program - this is used later for offsets */
+ Newxz(RExC_open_parens, RExC_npar, regnode_offset);
+ RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
+
+ /* setup RExC_close_parens, which holds the address of each
+ * CLOSE tag, and to make things simpler for the 0 index
+ * the end of the program - this is used later for offsets
+ * */
+ Newxz(RExC_close_parens, RExC_npar, regnode_offset);
+ /* we dont know where end op starts yet, so we dont need to
+ * set RExC_close_parens[0] like we do RExC_open_parens[0]
+ * above */
+ }
+ else {
+ Renew(RExC_open_parens, RExC_npar, regnode_offset);
+ Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset);
+
+ Renew(RExC_close_parens, RExC_npar, regnode_offset);
+ Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset);
+ }
+ }
ret = reganode(pRExC_state, OPEN, parno);
- if (!SIZE_ONLY ){
- if (!RExC_nestroot)
- RExC_nestroot = parno;
- if (RExC_open_parens && !RExC_open_parens[parno])
- {
- DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Setting open paren #%" IVdf " to %d\n",
- 22, "| |", (int)(depth * 2 + 1), "",
- (IV)parno, REG_NODE_NUM(REGNODE_p(ret))));
- RExC_open_parens[parno]= ret;
- }
- }
+ if (!RExC_nestroot)
+ RExC_nestroot = parno;
+ if (RExC_open_parens && !RExC_open_parens[parno])
+ {
+ DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
+ "%*s%*s Setting open paren #%" IVdf " to %d\n",
+ 22, "| |", (int)(depth * 2 + 1), "",
+ (IV)parno, REG_NODE_NUM(REGNODE_p(ret))));
+ RExC_open_parens[parno]= ret;
+ }
Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
Set_Node_Offset_To_R(br, parse_start-RExC_start);
}
have_branch = 1;
- if (SIZE_ONLY)
- RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
}
else if (paren == ':') {
*flagp |= flags&SIMPLE;
REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
ender);
}
- if (SIZE_ONLY)
- RExC_extralen += 2; /* Account for LONGJMP. */
nextchar(pRExC_state);
if (freeze_paren) {
if (RExC_npar > after_freeze)
break;
case 0:
ender = reg_node(pRExC_state, END);
- if (!SIZE_ONLY) {
- assert(!RExC_end_op); /* there can only be one! */
- RExC_end_op = REGNODE_p(ender);
- if (RExC_close_parens) {
- DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Setting close paren #0 (END) to %d\n",
- 22, "| |", (int)(depth * 2 + 1), "",
- REG_NODE_NUM(REGNODE_p(ender))));
+ assert(!RExC_end_op); /* there can only be one! */
+ RExC_end_op = REGNODE_p(ender);
+ if (RExC_close_parens) {
+ DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
+ "%*s%*s Setting close paren #0 (END) to %d\n",
+ 22, "| |", (int)(depth * 2 + 1), "",
+ REG_NODE_NUM(REGNODE_p(ender))));
- RExC_close_parens[0]= ender;
- }
+ RExC_close_parens[0]= ender;
}
break;
}
- DEBUG_PARSE_r(if (!SIZE_ONLY) {
+ DEBUG_PARSE_r(
DEBUG_PARSE_MSG("lsbr");
regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
(IV)REG_NODE_NUM(REGNODE_p(ender)),
(IV)(ender - lastbr)
);
- });
+ );
REGTAIL(pRExC_state, lastbr, ender);
- if (have_branch && !SIZE_ONLY) {
+ if (have_branch) {
char is_nothing= 1;
if (depth==1)
RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
br= PL_regkind[OP(ret_as_regnode)] != BRANCH
? regnext(ret_as_regnode)
: ret_as_regnode;
- DEBUG_PARSE_r(if (!SIZE_ONLY) {
+ DEBUG_PARSE_r(
DEBUG_PARSE_MSG("NADA");
regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
NULL, pRExC_state);
(IV)REG_NODE_NUM(REGNODE_p(ender)),
(IV)(ender - ret)
);
- });
+ );
OP(br)= NOTHING;
if (OP(REGNODE_p(ender)) == TAIL) {
NEXT_OFF(br)= 0;
}
}
- if (!first && SIZE_ONLY)
- RExC_extralen += 1; /* BRANCHJ */
-
*flagp = WORST; /* Tentatively. */
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
/* FIXME adding one for every branch after the first is probably
* excessive now we have TRIE support. (hv) */
MARK_NAUGHTY(1);
+ if ( chain > (SSize_t) BRANCH_MAX_OFFSET
+ && ! RExC_use_BRANCHJ)
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
REGTAIL(pRExC_state, chain, latest);
}
chain = latest;
if (max < min) { /* If can't match, warn and optimize to fail
unconditionally */
reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
- if (PASS2) {
- ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
- NEXT_OFF(REGNODE_p(orig_emit)) =
- regarglen[OPFAIL] + NODE_STEP_REGNODE;
- }
+ ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
+ NEXT_OFF(REGNODE_p(orig_emit)) =
+ regarglen[OPFAIL] + NODE_STEP_REGNODE;
return ret;
}
else if (min == max && *RExC_parse == '?')
NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
LONGJMP. */
REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
- if (SIZE_ONLY)
- RExC_whilem_seen++, RExC_extralen += 3;
+ RExC_whilem_seen++;
MARK_NAUGHTY_EXP(1, 4); /* compound interest */
}
FLAGS(REGNODE_p(ret)) = 0;
*flagp = WORST;
if (max > 0)
*flagp |= HASWIDTH;
- if (!SIZE_ONLY) {
- ARG1_SET(REGNODE_p(ret), (U16)min);
- ARG2_SET(REGNODE_p(ret), (U16)max);
- }
+ ARG1_SET(REGNODE_p(ret), (U16)min);
+ ARG2_SET(REGNODE_p(ret), (U16)max);
if (max == REG_INFTY)
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
goto do_curly;
}
nest_check:
- if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
+ if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
ckWARN2reg(RExC_parse,
"%" UTF8f " matches null string many times",
UTF8fARG(UTF, (RExC_parse >= origparse
PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
- /* Don't bother to check for downgrading in PASS1, as it doesn't make any
- * sizing difference, and is extra work that is thrown away */
- if (downgradable && ! PASS2) {
- downgradable = FALSE;
- }
-
if (! len_passed_in) {
if (UTF) {
if (UVCHR_IS_INVARIANT(code_point)) {
if (LOC || ! FOLD) { /* /l defers folding until runtime */
*character = (U8) code_point;
}
- else { /* Here is /i and not /l. (toFOLD() is defined on just
- ASCII, which isn't the same thing as INVARIANT on
- EBCDIC, but it works there, as the extra invariants
- fold to themselves) */
+ else { /* Here is /i and not /l. */
*character = toFOLD((U8) code_point);
/* We can downgrade to an EXACT node if this character
}
}
- if (SIZE_ONLY) {
- RExC_size += STR_SZ(len);
+ if (downgradable) {
+ change_engine_size(pRExC_state, STR_SZ(len));
}
- else {
- RExC_emit += STR_SZ(len);
- STR_LEN(REGNODE_p(node)) = len;
- if (! len_passed_in) {
- Copy((char *) character, STRING(REGNODE_p(node)), len, char);
- }
+
+ RExC_emit += STR_SZ(len);
+ STR_LEN(REGNODE_p(node)) = len;
+ if (! len_passed_in) {
+ Copy((char *) character, STRING(REGNODE_p(node)), len, char);
}
*flagp |= HASWIDTH;
*flagp |= SIMPLE;
}
- /* The OP may not be well defined in PASS1 */
- if (PASS2 && OP(REGNODE_p(node)) == EXACTFL) {
+ if (OP(REGNODE_p(node)) == EXACTFL) {
RExC_contains_locale = 1;
}
}
/* SBOL is shared with /^/ so we set the flags so we can tell
* /\A/ from /^/ in split. We check ret because first pass we
* have no regop struct to set the flags on. */
- if (PASS2)
- FLAGS(REGNODE_p(ret)) = 1;
+ FLAGS(REGNODE_p(ret)) = 1;
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'G':
*flagp |= SIMPLE;
if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
FLAGS(REGNODE_p(ret)) = TRADITIONAL_BOUND;
- if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
+ if (op > BOUNDA) { /* /aa is same as /a */
OP(REGNODE_p(ret)) = BOUNDA;
}
}
RExC_parse = endbrace;
REQUIRE_UNI_RULES(flagp, 0);
- if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
+ if (op >= BOUNDA) { /* /aa is same as /a */
OP(REGNODE_p(ret)) = BOUNDU;
length += 4;
}
}
- if (PASS2 && invert) {
+ if (invert) {
OP(REGNODE_p(ret)) += NBOUND - BOUND;
}
goto finish_meta_pat;
}
ret = reg_node(pRExC_state, op);
- if (! SIZE_ONLY) {
- FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
- }
+ FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
*flagp |= HASWIDTH|SIMPLE;
/* FALLTHROUGH */
vFAIL("Unterminated \\g{...} pattern");
RExC_parse++;
}
- if (!SIZE_ONLY) {
- if (num > (I32)RExC_rx->nparens)
- vFAIL("Reference to nonexistent group");
+ if (num >= (I32)RExC_npar) {
+
+ /* It might be a forward reference; we can't fail until we
+ * know, by completing the parse to get all the groups, and
+ * then reparsing */
+ if (RExC_total_parens > 0) {
+ if (num >= RExC_total_parens) {
+ vFAIL("Reference to nonexistent group");
+ }
+ }
+ else {
+ REQUIRE_PARENS_PASS;
+ }
}
RExC_sawback = 1;
ret = reganode(pRExC_state,
/* This allows us to fill a node with just enough spare so that if the final
* character folds, its expansion is guaranteed to fit */
#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
- char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE+1];
char *s0;
U8 upper_parse = MAX_NODE_STRING_SIZE;
* created for the new category. */
U8 node_type = EXACT;
+ /* Assume node will be fully used; the excess is given back at the end */
+ Ptrdiff_t initial_size = STR_SZ(256);
+
bool next_is_quantifier;
char * oldp = NULL;
* which don't participate in folds with Latin1-range characters,
* as the latter's folds aren't known until runtime. (We don't
* need to figure this out until pass 2) */
- bool maybe_exactfu = PASS2;
+ bool maybe_exactfu = TRUE;
/* To see if RExC_uni_semantics changes during parsing of the node.
* */
bool uni_semantics_at_node_start;
- /* The node_type may change below, but since the size of the node
- * doesn't change, it works */
- ret = reg_node(pRExC_state, node_type);
+ /* Allocate an EXACT node. The node_type may change below to
+ * another EXACTish node, but since the size of the node doesn't
+ * change, it works */
+ ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
+ FILL_NODE(ret, node_type);
+ RExC_emit++;
- /* In pass1, folded, we use a temporary buffer instead of the
- * actual node, as the node doesn't exist yet */
- s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(REGNODE_p(ret));
+ s = STRING(REGNODE_p(ret));
s0 = s;
if (! FOLD) { /* The simple case, just append the literal */
- /* In the sizing pass, we need only the size of the
- * character we are appending, hence we can delay getting
- * its representation until PASS2. */
- if (SIZE_ONLY) {
- if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
- const STRLEN unilen = UVCHR_SKIP(ender);
- s += unilen;
- added_len = unilen;
- }
- else {
- s++;
- }
- } else { /* PASS2 */
not_fold_common:
if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
U8 * new_s = uvchr_to_utf8((U8*)s, ender);
else {
*(s++) = (char) ender;
}
- }
}
else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
loopdone: /* Jumped to when encounters something that shouldn't be
in the node */
+ change_engine_size(pRExC_state, - (initial_size - STR_SZ(len)));
+
/* I (khw) don't know if you can get here with zero length, but the
* old code handled this situation by creating a zero-length EXACT
* node. Might as well be NOTHING instead */
/* Position parse to next real character */
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
FALSE /* Don't force to /x */ );
- if ( PASS2 && *RExC_parse == '{'
+ if ( *RExC_parse == '{'
&& OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
{
if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
const bool save_fold = FOLD; /* Temporary */
char *save_end, *save_parse; /* Temporaries */
const bool in_locale = LOC; /* we turn off /l during processing */
- AV* posix_warnings = NULL;
GET_RE_DEBUG_FLAGS_DECL;
* compile time values are valid in all runtime cases */
REQUIRE_UNI_RULES(flagp, 0);
- /* This will return only an ANYOF regnode, or (unlikely) something smaller
- * (such as EXACT). Thus we can skip most everything if just sizing. We
- * call regclass to handle '[]' so as to not have to reinvent its parsing
- * rules here (throwing away the size it computes each time). And, we exit
- * upon an unescaped ']' that isn't one ending a regclass. To do both
- * these things, we need to realize that something preceded by a backslash
- * is escaped, so we have to keep track of backslashes */
- if (SIZE_ONLY) {
- UV nest_depth = 0; /* how many nested (?[...]) constructs */
-
- while (RExC_parse < RExC_end) {
- SV* current = NULL;
-
- skip_to_be_ignored_text(pRExC_state, &RExC_parse,
- TRUE /* Force /x */ );
-
- switch (*RExC_parse) {
- case '(':
- if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
- nest_depth++, RExC_parse+=2;
- /* FALLTHROUGH */
- default:
- break;
- case '\\':
- /* Skip past this, so the next character gets skipped, after
- * the switch */
- RExC_parse++;
- if (*RExC_parse == 'c') {
- /* Skip the \cX notation for control characters */
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
- }
- break;
-
- case '[':
- {
- /* See if this is a [:posix:] class. */
- bool is_posix_class = (OOB_NAMEDCLASS
- < handle_possible_posix(pRExC_state,
- RExC_parse + 1,
- NULL,
- NULL,
- TRUE /* checking only */));
- /* If it is a posix class, leave the parse pointer at the
- * '[' to fool regclass() into thinking it is part of a
- * '[[:posix:]]'. */
- if (! is_posix_class) {
- RExC_parse++;
- }
-
- /* regclass() can only return RESTART_PARSE and NEED_UTF8
- * if multi-char folds are allowed. */
- if (!regclass(pRExC_state, flagp, depth+1,
- is_posix_class, /* parse the whole char
- class only if not a
- posix class */
- FALSE, /* don't allow multi-char folds */
- TRUE, /* silence non-portable warnings. */
- TRUE, /* strict */
- FALSE, /* Require return to be an ANYOF */
- ¤t,
- &posix_warnings
- ))
- FAIL2("panic: regclass returned failure to handle_sets, "
- "flags=%#" UVxf, (UV) *flagp);
-
- /* function call leaves parse pointing to the ']', except
- * if we faked it */
- if (is_posix_class) {
- RExC_parse--;
- }
-
- SvREFCNT_dec(current); /* In case it returned something */
- break;
- }
-
- case ']':
- if (RExC_parse[1] == ')') {
- RExC_parse++;
- if (nest_depth--) break;
- node = reganode(pRExC_state, ANYOF, 0);
- nextchar(pRExC_state);
- Set_Node_Length(REGNODE_p(node),
- RExC_parse - oregcomp_parse + 1); /* MJD */
- if (in_locale) {
- set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
- }
-
- return node;
- }
- /* We output the messages even if warnings are off, because we'll fail
- * the very next thing, and these give a likely diagnosis for that */
- if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
- output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
- }
- RExC_parse++;
- vFAIL("Unexpected ']' with no following ')' in (?[...");
- }
-
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
- }
-
- /* We output the messages even if warnings are off, because we'll fail
- * the very next thing, and these give a likely diagnosis for that */
- if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
- output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
- }
-
- vFAIL("Syntax error in (?[...])");
- }
-
- /* Pass 2 only after this. */
ckWARNexperimental(RExC_parse,
WARN_EXPERIMENTAL__REGEX_SETS,
"The regex_sets feature is experimental");
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
TRUE /* Force /x */ );
- if (RExC_parse >= RExC_end) {
- Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
+ if (RExC_parse >= RExC_end) { /* Fail */
+ break;
}
curchar = UCHARAT(RExC_parse);
"flags=%#" UVxf, (UV) *flagp);
}
+ if (! current) {
+ break;
+ }
+
/* function call leaves parse pointing to the ']', except if we
* faked it */
if (is_posix_class) {
case ')':
if (av_tindex_skip_len_mg(fence_stack) < 0) {
+ if (UCHARAT(RExC_parse - 1) == ']') {
+ break;
+ }
RExC_parse++;
vFAIL("Unexpected ')'");
}
default:
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ if (RExC_parse >= RExC_end) {
+ break;
+ }
vFAIL("Unexpected character");
handle_operand:
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
} /* End of loop parsing through the construct */
+ vFAIL("Syntax error in (?[...])");
+
done:
+
+ if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
+ if (RExC_parse < RExC_end) {
+ RExC_parse++;
+ }
+
+ vFAIL("Unexpected ']' with no following ')' in (?[...");
+ }
+
if (av_tindex_skip_len_mg(fence_stack) >= 0) {
vFAIL("Unmatched (");
}
PREPARE_TO_DIE;
}
}
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s",
+ if (TO_OUTPUT_WARNINGS(RExC_parse)) {
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s",
SvPVX(msg));
+ }
SvREFCNT_dec_NN(msg);
}
}
const regnode_offset orig_emit = RExC_emit; /* Save the original RExC_emit in
case we need to change the emitted regop to an EXACT. */
const char * orig_parse = RExC_parse;
- const SSize_t orig_size = RExC_size;
bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
/* This variable is used to mark where the end in the input is of something
char *not_posix_region_end = RExC_parse - 1;
AV* posix_warnings = NULL;
- const bool do_posix_warnings = return_posix_warnings
- || (PASS2 && ckWARN(WARN_REGEXP));
+ const bool do_posix_warnings = return_posix_warnings || ckWARN(WARN_REGEXP);
U8 op = END; /* The returned node-type, initialized to an impossible
one. */
U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
allow_multi_folds = FALSE;
#endif
- if (SIZE_ONLY) {
- listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
- }
- else {
- listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
- initial_listsv_len = SvCUR(listsv);
- SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
- }
+ listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
+ initial_listsv_len = SvCUR(listsv);
+ SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
¬_posix_region_end,
NULL,
TRUE /* checking only */);
- if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
+ if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
ckWARN4reg(not_posix_region_end,
"POSIX syntax [%c %c] belongs inside character classes%s",
*RExC_parse, *RExC_parse,
e = RExC_parse;
n = 1;
}
- if (!SIZE_ONLY) {
+ {
char* name = RExC_parse;
char* base_name; /* name after any packages are stripped */
char* lookup_name = NULL;
}
}
}
- } /* End of actually getting the values in pass 2 */
+ }
RExC_parse = e + 1;
namedclass = ANYOF_UNIPROP; /* no official name, but it's
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
vFAIL("Need exactly 3 octal digits");
}
- else if (! SIZE_ONLY /* like \08, \178 */
- && numlen < 3
+ else if ( numlen < 3 /* like \08, \178 */
&& RExC_parse < RExC_end
&& isDIGIT(*RExC_parse)
&& ckWARN(WARN_REGEXP))
}
default:
/* Allow \_ to not give an error */
- if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
+ if (isWORDCHAR(value) && value != '_') {
if (strict) {
vFAIL2("Unrecognized escape \\%c in character class",
(int)value);
* literal, as is the character that began the false range, i.e.
* the 'a' in the examples */
if (range) {
- if (!SIZE_ONLY) {
- const int w = (RExC_parse >= rangebegin)
- ? RExC_parse - rangebegin
- : 0;
- if (strict) {
- vFAIL2utf8f(
- "False [] range \"%" UTF8f "\"",
- UTF8fARG(UTF, w, rangebegin));
- }
- else {
- ckWARN2reg(RExC_parse,
- "False [] range \"%" UTF8f "\"",
- UTF8fARG(UTF, w, rangebegin));
- cp_list = add_cp_to_invlist(cp_list, '-');
- cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
- prevvalue);
- }
- }
+ const int w = (RExC_parse >= rangebegin)
+ ? RExC_parse - rangebegin
+ : 0;
+ if (strict) {
+ vFAIL2utf8f(
+ "False [] range \"%" UTF8f "\"",
+ UTF8fARG(UTF, w, rangebegin));
+ }
+ else {
+ ckWARN2reg(RExC_parse,
+ "False [] range \"%" UTF8f "\"",
+ UTF8fARG(UTF, w, rangebegin));
+ cp_list = add_cp_to_invlist(cp_list, '-');
+ cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
+ prevvalue);
+ }
range = 0; /* this was not a true range */
element_count += 2; /* So counts for three values */
/* The above-Latin1 characters are not subject to locale rules.
* Just add them, in the second pass, to the
* unconditionally-matched list */
- if (! SIZE_ONLY) {
-
- /* Get the list of the above-Latin1 code points this
- * matches */
- _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
- PL_XPosix_ptrs[classnum],
-
- /* Odd numbers are complements, like
- * NDIGIT, NASCII, ... */
- namedclass % 2 != 0,
- &scratch_list);
- /* Checking if 'cp_list' is NULL first saves an extra
- * clone. Its reference count will be decremented at the
- * next union, etc, or if this is the only instance, at the
- * end of the routine */
- if (! cp_list) {
- cp_list = scratch_list;
- }
- else {
- _invlist_union(cp_list, scratch_list, &cp_list);
- SvREFCNT_dec_NN(scratch_list);
- }
- continue; /* Go get next character */
+
+ /* Get the list of the above-Latin1 code points this
+ * matches */
+ _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
+ PL_XPosix_ptrs[classnum],
+
+ /* Odd numbers are complements, like
+ * NDIGIT, NASCII, ... */
+ namedclass % 2 != 0,
+ &scratch_list);
+ /* Checking if 'cp_list' is NULL first saves an extra
+ * clone. Its reference count will be decremented at the
+ * next union, etc, or if this is the only instance, at the
+ * end of the routine */
+ if (! cp_list) {
+ cp_list = scratch_list;
}
+ else {
+ _invlist_union(cp_list, scratch_list, &cp_list);
+ SvREFCNT_dec_NN(scratch_list);
+ }
+ continue; /* Go get next character */
}
- else if (! SIZE_ONLY) {
+ else {
/* Here, not in pass1 (in that pass we skip calculating the
* contents of this class), and is not /l, or is a POSIX class
/* a bad range like \w-, [:word:]- ? */
if (namedclass > OOB_NAMEDCLASS) {
- if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
+ if (strict || ckWARN(WARN_REGEXP)) {
const int w = RExC_parse >= rangebegin
? RExC_parse - rangebegin
: 0;
w, w, rangebegin);
}
}
- if (!SIZE_ONLY) {
- cp_list = add_cp_to_invlist(cp_list, '-');
- }
+ cp_list = add_cp_to_invlist(cp_list, '-');
element_count++;
} else
range = 1; /* yeah, it's a range! */
}
}
- if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
+ if (strict && ckWARN(WARN_REGEXP)) {
if (range) {
/* If the range starts above 255, everything is portable and
}
/* Deal with this element of the class */
- if (! SIZE_ONLY) {
#ifndef EBCDIC
- cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
- prevvalue, value);
+ cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+ prevvalue, value);
#else
- /* On non-ASCII platforms, for ranges that span all of 0..255, and
- * ones that don't require special handling, we can just add the
- * range like we do for ASCII platforms */
- if ((UNLIKELY(prevvalue == 0) && value >= 255)
- || ! (prevvalue < 256
- && (unicode_range
- || (! non_portable_endpoint
- && ((isLOWER_A(prevvalue) && isLOWER_A(value))
- || (isUPPER_A(prevvalue)
- && isUPPER_A(value)))))))
- {
- cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
- prevvalue, value);
+ /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
+ * that don't require special handling, we can just add the range like
+ * we do for ASCII platforms */
+ if ((UNLIKELY(prevvalue == 0) && value >= 255)
+ || ! (prevvalue < 256
+ && (unicode_range
+ || (! non_portable_endpoint
+ && ((isLOWER_A(prevvalue) && isLOWER_A(value))
+ || (isUPPER_A(prevvalue)
+ && isUPPER_A(value)))))))
+ {
+ cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+ prevvalue, value);
+ }
+ else {
+ /* Here, requires special handling. This can be because it is a
+ * range whose code points are considered to be Unicode, and so
+ * must be individually translated into native, or because its a
+ * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
+ * EBCDIC, but we have defined them to include only the "expected"
+ * upper or lower case ASCII alphabetics. Subranges above 255 are
+ * the same in native and Unicode, so can be added as a range */
+ U8 start = NATIVE_TO_LATIN1(prevvalue);
+ unsigned j;
+ U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
+ for (j = start; j <= end; j++) {
+ cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
}
- else {
- /* Here, requires special handling. This can be because it is
- * a range whose code points are considered to be Unicode, and
- * so must be individually translated into native, or because
- * its a subrange of 'A-Z' or 'a-z' which each aren't
- * contiguous in EBCDIC, but we have defined them to include
- * only the "expected" upper or lower case ASCII alphabetics.
- * Subranges above 255 are the same in native and Unicode, so
- * can be added as a range */
- U8 start = NATIVE_TO_LATIN1(prevvalue);
- unsigned j;
- U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
- for (j = start; j <= end; j++) {
- cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
- }
- if (value > 255) {
- cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
- 256, value);
- }
+ if (value > 255) {
+ cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+ 256, value);
}
-#endif
}
+#endif
range = 0; /* this range (if it was one) is done now */
} /* End of loop through all the text within the brackets */
* the parse */
const char * cur_parse = RExC_parse;
RExC_parse = (char *)orig_parse;
- if (PL_regkind[op] == POSIXD) {
- if (op == POSIXL) {
- RExC_contains_locale = 1;
- }
- if (invert) {
- op += NPOSIXD - POSIXD;
- }
+ if (PL_regkind[op] == POSIXD) {
+ if (op == POSIXL) {
+ RExC_contains_locale = 1;
+ }
+ if (invert) {
+ op += NPOSIXD - POSIXD;
}
+ }
ret = reg_node(pRExC_state, op);
if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
- if (! SIZE_ONLY) {
- FLAGS(REGNODE_p(ret)) = arg;
- }
+ FLAGS(REGNODE_p(ret)) = arg;
*flagp |= HASWIDTH|SIMPLE;
}
else if (PL_regkind[op] == EXACT) {
}
}
- /* Assume we are going to generate an ANYOF-type node. */
- op = (posixl)
- ? ANYOFPOSIXL
- : (LOC)
- ? ANYOFL
- : ANYOF;
- ret = reganode(pRExC_state, op, 0);
-
- if (SIZE_ONLY) {
- return ret;
- }
-
- /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
-
/* If folding, we calculate all characters that could fold to or from the
* ones already on the list */
if (cp_foldable_list) {
}
if (ret_invlist) {
- assert(cp_list);
-
*ret_invlist = cp_list;
SvREFCNT_dec(swash);
- /* Discard the generated node */
- if (SIZE_ONLY) {
- RExC_size = orig_size;
- }
- else {
- RExC_emit = orig_emit;
- }
- return orig_emit;
+ return RExC_emit;
}
/* Some character classes are equivalent to other nodes. Such nodes take
}
/* It's going to be an ANYOF node. */
- OP(REGNODE_p(ret)) = (use_anyofd)
+ op = (use_anyofd)
? ANYOFD
: ((posixl)
? ANYOFPOSIXL
: ((LOC)
? ANYOFL
: ANYOF));
+ ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
+ FILL_NODE(ret, op); /* We set the argument later */
+ RExC_emit += 1 + regarglen[op];
ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
/* Here, <cp_list> contains all the code points we can determine at
PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
RExC_size += size;
+
+ Renewc(RExC_rxi,
+ sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
+ /* +1 for REG_MAGIC */
+ char,
+ regexp_internal);
+ if ( RExC_rxi == NULL )
+ FAIL("Regexp out of space");
+ RXi_SET(RExC_rx, RExC_rxi);
+
+ RExC_emit_start = RExC_rxi->program;
+ if (size > 0) {
+ Zero(REGNODE_p(RExC_emit), size, regnode);
+ }
+
+#ifdef RE_TRACK_PATTERN_OFFSETS
+ Renew(RExC_offsets, 2*RExC_size+1, U32);
+ if (size > 0) {
+ Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
+ }
+ RExC_offsets[0] = 2*RExC_size+1;
+#endif
}
STATIC regnode_offset
PERL_ARGS_ASSERT_REGNODE_GUTS;
- assert(extra_size >= regarglen[op]);
-
- if (SIZE_ONLY) {
- SIZE_ALIGN(RExC_size);
+ SIZE_ALIGN(RExC_size);
change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
- return(ret);
- }
- if (REGNODE_p(RExC_emit) >= RExC_emit_bound)
- Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
- op, (void*)REGNODE_p(RExC_emit), (void*)RExC_emit_bound);
-
NODE_ALIGN_FILL(REGNODE_p(ret));
#ifndef RE_TRACK_PATTERN_OFFSETS
PERL_UNUSED_ARG(name);
+ PERL_UNUSED_ARG(op);
#else
+ assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
+
if (RExC_offsets) { /* MJD */
MJD_OFFSET_DEBUG(
("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
+ regnode_offset ptr = ret;
PERL_ARGS_ASSERT_REG_NODE;
assert(regarglen[op] == 0);
- if (PASS2) {
- regnode_offset ptr = ret;
- FILL_ADVANCE_NODE(ptr, op);
- RExC_emit = ptr;
- }
+ FILL_ADVANCE_NODE(ptr, op);
+ RExC_emit = ptr;
return(ret);
}
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
+ regnode_offset ptr = ret;
PERL_ARGS_ASSERT_REGANODE;
/* ANYOF are special cased to allow non-length 1 args */
- assert(regarglen[op] == 1 || PL_regkind[op] == ANYOF);
+ assert(regarglen[op] == 1);
- if (PASS2) {
- regnode_offset ptr = ret;
- FILL_ADVANCE_NODE_ARG(ptr, op, arg);
- RExC_emit = ptr;
- }
+ FILL_ADVANCE_NODE_ARG(ptr, op, arg);
+ RExC_emit = ptr;
return(ret);
}
/* emit a node with U32 and I32 arguments */
const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
+ regnode_offset ptr = ret;
PERL_ARGS_ASSERT_REG2LANODE;
assert(regarglen[op] == 2);
- if (PASS2) {
- regnode_offset ptr = ret;
- FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
- RExC_emit = ptr;
- }
+ FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
+ RExC_emit = ptr;
return(ret);
}
* IMPORTANT NOTE - it is the *callers* responsibility to correctly
* set up NEXT_OFF() of the inserted node if needed. Something like this:
*
-* reginsert(pRExC, OPFAIL, orig_emit, depth+1);
-* if (PASS2)
-* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
+* reginsert(pRExC, OPFAIL, orig_emit, depth+1);
+* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
*
* ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
*/
PERL_UNUSED_ARG(depth);
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
- if (SIZE_ONLY) {
- RExC_size += size;
- return;
- }
assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
studying. If this is wrong then we need to adjust RExC_recurse
below like we do with RExC_open_parens/RExC_close_parens. */
+ change_engine_size(pRExC_state, (Ptrdiff_t) size);
src = REGNODE_p(RExC_emit);
RExC_emit += size;
dst = REGNODE_p(RExC_emit);
PERL_UNUSED_ARG(depth);
#endif
- if (SIZE_ONLY)
- return;
-
/* Find last node. */
scan = (regnode_offset) p;
for (;;) {
PERL_ARGS_ASSERT_REGTAIL_STUDY;
- if (SIZE_ONLY)
- return exact;
-
/* Find last node. */
scan = p;
PERL_ARGS_ASSERT_PREGFREE2;
+ if (! r)
+ return;
+
if (r->mother_re) {
ReREFCNT_dec(r->mother_re);
} else {
PERL_ARGS_ASSERT_REGFREE_INTERNAL;
+ if (! ri) {
+ return;
+ }
+
DEBUG_COMPILE_r({
if (!PL_colorset)
reginitcolors();