I32 whilem_seen; /* number of WHILEM in this expr */
regnode *emit_start; /* Start of emitted-code area */
regnode *emit_bound; /* First regnode outside of the allocated space */
- regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
+ regnode *emit; /* Code-emit pointer; if = &emit_dummy, this is
+ the first (sizing) pass, so don't emit */
+ regnode emit_dummy; /* placeholder for emit to point to */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
#endif
#define RExC_emit (pRExC_state->emit)
+#define RExC_emit_dummy (pRExC_state->emit_dummy)
#define RExC_emit_start (pRExC_state->emit_start)
#define RExC_emit_bound (pRExC_state->emit_bound)
#define RExC_naughty (pRExC_state->naughty)
#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
#define SCF_SEEN_ACCEPT 0x8000
+#define SCF_TRIE_DOING_RESTUDY 0x10000
#define UTF cBOOL(RExC_utf8)
#define Set_Cur_Node_Offset
#define Set_Node_Length_To_R(node,len)
#define Set_Node_Length(node,len)
-#define Set_Node_Cur_Length(node)
+#define Set_Node_Cur_Length(node,start)
#define Node_Offset(n)
#define Node_Length(n)
#define Set_Node_Offset_Length(node,offset,len)
#define Set_Node_Length(node,len) \
Set_Node_Length_To_R((node)-RExC_emit_start, len)
-#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
-#define Set_Node_Cur_Length(node) \
- Set_Node_Length(node, RExC_parse - parse_start)
+#define Set_Node_Cur_Length(node, start) \
+ Set_Node_Length(node, RExC_parse - start)
/* Get offsets and lengths */
#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
len = 0; \
} else { \
len = 1; \
- uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
+ uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
skiplen = UNISKIP(uvc); \
foldlen -= skiplen; \
scan = foldbuf + skiplen; \
* this file makes sure that in EXACTFU nodes, the sharp s gets folded to
* 'ss', even if the pattern isn't UTF-8. This avoids the issues
* described in the next item.
- * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
- * 'ss' or not is not knowable at compile time. It will match iff the
- * target string is in UTF-8, unlike the EXACTFU nodes, where it always
- * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
- * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
- * described in item 3). An assumption that the optimizer part of
- * regexec.c (probably unwittingly) makes is that a character in the
- * pattern corresponds to at most a single character in the target string.
- * (And I do mean character, and not byte here, unlike other parts of the
- * documentation that have never been updated to account for multibyte
- * Unicode.) This assumption is wrong only in this case, as all other
- * cases are either 1-1 folds when no UTF-8 is involved; or is true by
- * virtue of having this file pre-fold UTF-8 patterns. I'm
- * reluctant to try to change this assumption, so instead the code punts.
- * This routine examines EXACTF nodes for the sharp s, and returns a
- * boolean indicating whether or not the node is an EXACTF node that
- * contains a sharp s. When it is true, the caller sets a flag that later
- * causes the optimizer in this file to not set values for the floating
- * and fixed string lengths, and thus avoids the optimizer code in
- * regexec.c that makes the invalid assumption. Thus, there is no
- * optimization based on string lengths for EXACTF nodes that contain the
- * sharp s. This only happens for /id rules (which means the pattern
- * isn't in UTF-8).
+ * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
+ * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
+ * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
+ * (probably unwittingly, in Perl_regexec_flags()) makes is that a
+ * character in the pattern corresponds to at most a single character in
+ * the target string. (And I do mean character, and not byte here, unlike
+ * other parts of the documentation that have never been updated to
+ * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
+ * two character string 'ss'; in EXACTFA nodes it can match
+ * "\x{17F}\x{17F}". These violate the assumption, and they are the only
+ * instances where it is violated. I'm reluctant to try to change the
+ * assumption, as the code involved is impenetrable to me (khw), so
+ * instead the code here punts. This routine examines (when the pattern
+ * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
+ * boolean indicating whether or not the node contains a sharp s. When it
+ * is true, the caller sets a flag that later causes the optimizer in this
+ * file to not set values for the floating and fixed string lengths, and
+ * thus avoids the optimizer code in regexec.c that makes the invalid
+ * assumption. Thus, there is no optimization based on string lengths for
+ * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
+ * (The reason the assumption is wrong only in these two cases is that all
+ * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
+ * other folds to their expanded versions. We can't prefold sharp s to
+ * 'ss' in EXACTF nodes because we don't know at compile time if it
+ * actually matches 'ss' or not. It will match iff the target string is
+ * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
+ * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
+ * pattern, sharp s is folded to "\x{17F}\x{17F}", avoiding the problem;
+ * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
+ * require the pattern to be forced into UTF-8, the overhead of which we
+ * want to avoid.)
*/
#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
OP(scan) = EXACTFU_SS;
s += 2;
}
- else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
+ else if (len == 6 /* len is the same in both ASCII and EBCDIC
+ for these */
&& (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
COMBINING_DIAERESIS_UTF8
COMBINING_ACUTE_ACCENT_UTF8,
next_iteration: ;
}
}
- else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+ else if (OP(scan) == EXACTFA) {
- /* Here, the pattern is not UTF-8. Look for the multi-char folds
- * that are all ASCII. As in the above case, EXACTFL and EXACTFA
- * nodes can't have multi-char folds to this range (and there are
- * no existing ones in the upper latin1 range). In the EXACTF
- * case we look also for the sharp s, which can be in the final
+ /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
+ * fold to the ASCII range (and there are no existing ones in the
+ * upper latin1 range). But, as outlined in the comments preceding
+ * this function, we need to flag any occurrences of the sharp s */
+ while (s < s_end) {
+ if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+ *has_exactf_sharp_s = TRUE;
+ break;
+ }
+ s++;
+ continue;
+ }
+ }
+ else if (OP(scan) != EXACTFL) {
+
+ /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
+ * multi-char folds that are all Latin1. (This code knows that
+ * there are no current multi-char folds possible with EXACTFL,
+ * relying on fold_grind.t to catch any errors if the very unlikely
+ * event happens that some get added in future Unicode versions.)
+ * As explained in the comments preceding this function, we look
+ * also for the sharp s in EXACTF nodes; it can be in the final
* position. Otherwise we can stop looking 1 byte earlier because
* have to find at least two characters for a multi-fold */
const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
}
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
- if ( /* ? quantifier ok, except for (?{ ... }) */
- (next_is_eval || !(mincount == 0 && maxcount == 1))
+ if (!(flags & SCF_TRIE_DOING_RESTUDY)
+ /* ? quantifier ok, except for (?{ ... }) */
+ && (next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
&& data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
&& maxcount <= REG_INFTY/3) /* Complement check for big count */
NULL, NULL, rx_flags, 0);
}
+
+/* Upgrade the pattern *pat_p, of length *plen_p bytes, to UTF8, and if
+ * there are code blocks, recalculate their indices. *pat_p and *plen_p are
+ * updated in-place to point to the newly allocated string and its length.
+ *
+ * This is essentially a copy of Perl_bytes_to_utf8() with the code index
+ * stuff added.
+ *
+ * pRExC_state - compile state; the start/end fields of its code_blocks[]
+ * entries are rewritten to positions in the new string
+ * pat_p - in/out: pattern buffer. The new buffer comes from Newx() and
+ * is handed to SAVEFREEPV(); the old buffer is not freed here
+ * plen_p - in/out: pattern length in bytes
+ * num_code_blocks - number of leading code_blocks[] entries to remap
+ *
+ * On return RExC_utf8 and RExC_orig_utf8 are both set to 1. */
+
+static void
+S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
+ char **pat_p, STRLEN *plen_p, int num_code_blocks)
+{
+ U8 *const src = (U8*)*pat_p;
+ U8 *dst;
+ int n=0; /* index of the code block currently being remapped */
+ STRLEN s = 0, d = 0; /* read position in src, write position in dst */
+ bool do_end = 0; /* set between seeing block n's start and its end */
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+
+ Newx(dst, *plen_p * 2 + 1, U8); /* each source byte yields at most two
+ * bytes; the extra one is for the NUL */
+
+ while (s < *plen_p) {
+ const UV uv = NATIVE_TO_ASCII(src[s]);
+ if (UNI_IS_INVARIANT(uv))
+ dst[d] = (U8)UTF_TO_NATIVE(uv);
+ else {
+ dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv); /* non-invariant byte is
+ * written as two bytes */
+ dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
+ }
+ if (n < num_code_blocks) { /* only block n is examined, so the blocks
+ * are presumably stored in ascending
+ * order of position - TODO confirm */
+ if (!do_end && pRExC_state->code_blocks[n].start == s) {
+ pRExC_state->code_blocks[n].start = d;
+ assert(dst[d] == '(');
+ do_end = 1;
+ }
+ else if (do_end && pRExC_state->code_blocks[n].end == s) {
+ pRExC_state->code_blocks[n].end = d;
+ assert(dst[d] == ')');
+ do_end = 0;
+ n++;
+ }
+ }
+ s++;
+ d++; /* step past the (last) byte just written */
+ }
+ dst[d] = '\0';
+ *plen_p = d;
+ *pat_p = (char*) dst;
+ SAVEFREEPV(*pat_p);
+ RExC_orig_utf8 = RExC_utf8 = 1;
+}
+
+
+
+/* S_concat_pat(): concatenate a list of args to the pattern string pat,
+ * while recording any code block indices, and handling overloading,
+ * nested qr// objects etc. If pat is null, it will allocate a new
+ * string, or just return the first arg, if there's only one.
+ *
+ * Returns the malloced/updated pat.
+ * patternp and pat_count are the array of SVs to be concatted and the
+ * number of elements in it;
+ * oplist is the optional list of ops that generated the SVs;
+ * recompile_p is a pointer to a boolean that will be set if
+ * the regex will need to be recompiled.
+ * delim, if non-null is an SV that will be inserted between each element
+ *
+ * Code block positions are recorded in pRExC_state->code_blocks[]; when an
+ * embedded qr// carries code blocks of its own, that array is grown with
+ * Renew() and pRExC_state->num_code_blocks is increased to match.
+ */
+
+static SV*
+S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
+ SV *pat, SV ** const patternp, int pat_count,
+ OP *oplist, bool *recompile_p, SV *delim)
+{
+ SV **svp;
+ int n = 0; /* number of code_blocks[] entries filled in by this call */
+ bool use_delim = FALSE; /* TRUE when the next trip round the loop
+ * should handle delim rather than *svp */
+ bool alloced = FALSE; /* TRUE if pat was created by this call */
+
+ /* if we know we have at least two args, create an empty string,
+ * then concatenate args to that. For no args, return an empty string */
+ if (!pat && pat_count != 1) {
+ pat = newSVpvn("", 0);
+ SAVEFREESV(pat);
+ alloced = TRUE;
+ }
+
+ for (svp = patternp; svp < patternp + pat_count; svp++) {
+ SV *sv;
+ SV *rx = NULL;
+ STRLEN orig_patlen = 0;
+ bool code = 0; /* set when the current arg is the text of a literal
+ * code block */
+ SV *msv = use_delim ? delim : *svp;
+
+ /* if we've got a delimiter, we go round the loop twice for each
+ * svp slot (except the last), using the delimiter the second
+ * time round. The svp-- below cancels the loop's svp++ so that the
+ * delimiter pass does not consume a slot */
+ if (use_delim) {
+ svp--;
+ use_delim = FALSE;
+ }
+ else if (delim)
+ use_delim = TRUE;
+
+ if (SvTYPE(msv) == SVt_PVAV) {
+ /* we've encountered an interpolated array within
+ * the pattern, e.g. /...@a..../. Expand the list of elements,
+ * then recursively append elements.
+ * The code in this block is based on S_pushav() */
+
+ AV *const av = (AV*)msv;
+ const I32 maxarg = AvFILL(av) + 1;
+ SV **array;
+
+ if (oplist) { /* step over the op that produced this array */
+ assert(oplist->op_type == OP_PADAV
+ || oplist->op_type == OP_RV2AV);
+ oplist = oplist->op_sibling;;
+ }
+
+ if (SvRMAGICAL(av)) { /* fetch each element through av_fetch()
+ * into a temporary array */
+ U32 i;
+
+ Newx(array, maxarg, SV*);
+ SAVEFREEPV(array);
+ for (i=0; i < (U32)maxarg; i++) {
+ SV ** const svp = av_fetch(av, i, FALSE);
+ array[i] = svp ? *svp : &PL_sv_undef;
+ }
+ }
+ else
+ array = AvARRAY(av);
+
+ pat = S_concat_pat(aTHX_ pRExC_state, pat,
+ array, maxarg, NULL, recompile_p,
+ /* $" */
+ GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
+
+ continue; /* the recursive call has appended the elements */
+ }
+
+
+ /* we make the assumption here that each op in the list of
+ * op_siblings maps to one SV pushed onto the stack,
+ * except for code blocks, which have both an OP_NULL and
+ * an OP_CONST.
+ * This allows us to match up the list of SVs against the
+ * list of OPs to find the next code block.
+ *
+ * Note that PUSHMARK PADSV PADSV ..
+ * is optimised to
+ * PADRANGE PADSV PADSV ..
+ * so the alignment still works. */
+
+ if (oplist) {
+ if (oplist->op_type == OP_NULL
+ && (oplist->op_flags & OPf_SPECIAL))
+ {
+ assert(n < pRExC_state->num_code_blocks);
+ pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
+ pRExC_state->code_blocks[n].block = oplist;
+ pRExC_state->code_blocks[n].src_regex = NULL;
+ n++;
+ code = 1;
+ oplist = oplist->op_sibling; /* skip CONST */
+ assert(oplist);
+ }
+ oplist = oplist->op_sibling;;
+ }
+
+ /* apply magic and QR overloading to arg */
+
+ SvGETMAGIC(msv);
+ if (SvROK(msv) && SvAMAGIC(msv)) {
+ SV *sv = AMG_CALLunary(msv, regexp_amg);
+ if (sv) {
+ if (SvROK(sv))
+ sv = SvRV(sv);
+ if (SvTYPE(sv) != SVt_REGEXP)
+ Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
+ msv = sv;
+ }
+ }
+
+ /* try concatenation overload ... */
+ if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
+ (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
+ {
+ sv_setsv(pat, sv);
+ /* overloading involved: all bets are off over literal
+ * code. Pretend we haven't seen it */
+ pRExC_state->num_code_blocks -= n;
+ n = 0;
+ }
+ else {
+ /* ... or failing that, try "" overload. The loop repeats while
+ * the result is itself overloaded, and stops if the overload
+ * hands back the same SV or a reference to the same referent */
+ while (SvAMAGIC(msv)
+ && (sv = AMG_CALLunary(msv, string_amg))
+ && sv != msv
+ && !( SvROK(msv)
+ && SvROK(sv)
+ && SvRV(msv) == SvRV(sv))
+ ) {
+ msv = sv;
+ SvGETMAGIC(msv);
+ }
+ if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
+ msv = SvRV(msv);
+
+ if (pat) {
+ /* this is a partially unrolled
+ * sv_catsv_nomg(pat, msv);
+ * that allows us to adjust code block indices if
+ * needed */
+ STRLEN dlen;
+ char *dst = SvPV_force_nomg(pat, dlen);
+ orig_patlen = dlen; /* length before msv is appended; used
+ * below as the base offset for code
+ * blocks copied from an embedded qr//.
+ * NOTE(review): this is taken before the
+ * UTF-8 upgrade just below, which can
+ * lengthen pat - confirm the offset is
+ * still right in that case */
+ if (SvUTF8(msv) && !SvUTF8(pat)) {
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
+ sv_setpvn(pat, dst, dlen);
+ SvUTF8_on(pat);
+ }
+ sv_catsv_nomg(pat, msv);
+ rx = msv;
+ }
+ else
+ pat = msv;
+
+ if (code) /* end = index of the last byte of the text just added */
+ pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
+ }
+
+ /* extract any code blocks within any embedded qr//'s */
+ if (rx && SvTYPE(rx) == SVt_REGEXP
+ && RX_ENGINE((REGEXP*)rx)->op_comp)
+ {
+
+ RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
+ if (ri->num_code_blocks) {
+ int i;
+ /* the presence of an embedded qr// with code means
+ * we should always recompile: the text of the
+ * qr// may not have changed, but it may be a
+ * different closure than last time */
+ *recompile_p = 1;
+ Renew(pRExC_state->code_blocks,
+ pRExC_state->num_code_blocks + ri->num_code_blocks,
+ struct reg_code_block);
+ pRExC_state->num_code_blocks += ri->num_code_blocks;
+
+ for (i=0; i < ri->num_code_blocks; i++) {
+ struct reg_code_block *src, *dst;
+ STRLEN offset = orig_patlen
+ + ReANY((REGEXP *)rx)->pre_prefix;
+ assert(n < pRExC_state->num_code_blocks);
+ src = &ri->code_blocks[i];
+ dst = &pRExC_state->code_blocks[n];
+ dst->start = src->start + offset;
+ dst->end = src->end + offset;
+ dst->block = src->block;
+ /* the copied entry takes a reference on the regex the
+ * block came from: src's own src_regex if it has one,
+ * otherwise rx itself */
+ dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
+ src->src_regex
+ ? src->src_regex
+ : (REGEXP*)rx);
+ n++;
+ }
+ }
+ }
+ }
+ /* avoid calling magic multiple times on a single element e.g. =~ $qr:
+ * set-magic is only applied when the string was created by this call */
+ if (alloced)
+ SvSETMAGIC(pat);
+
+ return pat;
+}
+
+
+
/* see if there are any run-time code blocks in the pattern.
* False positives are allowed */
static bool
-S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
- U32 pm_flags, char *pat, STRLEN plen)
+S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
+ char *pat, STRLEN plen)
{
int n = 0;
STRLEN s;
- /* avoid infinitely recursing when we recompile the pattern parcelled up
- * as qr'...'. A single constant qr// string can't have have any
- * run-time component in it, and thus, no runtime code. (A non-qr
- * string, however, can, e.g. $x =~ '(?{})') */
- if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
- return 0;
-
for (s = 0; s < plen; s++) {
if (n < pRExC_state->num_code_blocks
&& s == pRExC_state->code_blocks[n].start)
SAVETMPS;
save_re_context();
PUSHSTACKi(PERLSI_REQUIRE);
- /* this causes the toker to collapse \\ into \ when parsing
- * qr''; normally only q'' does this. It also alters hints
- * handling */
- PL_reg_state.re_reparsing = TRUE;
- eval_sv(sv, G_SCALAR);
+ /* G_RE_REPARSING causes the toker to collapse \\ into \ when
+ * parsing qr''; normally only q'' does this. It also alters
+ * hints handling */
+ eval_sv(sv, G_SCALAR|G_RE_REPARSING);
SvREFCNT_dec_NN(sv);
SPAGAIN;
qr_ref = POPs;
S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
{
/* This is the common code for setting up the floating and fixed length
- * string data extracted from Perlre_op_compile() below. Returns a boolean
+ * string data extracted from Perl_re_op_compile() below. Returns a boolean
* as to whether succeeded or not */
I32 t,ml;
regexp_internal *ri;
STRLEN plen;
char *exp;
- char* xend;
regnode *scan;
I32 flags;
I32 minlen = 0;
U32 rx_flags;
- SV *pat = NULL;
+ SV *pat;
SV *code_blocksv = NULL;
+ SV** new_patternp = patternp;
/* these are all flags - maybe they should be turned
* into a single int with different bit masks */
I32 sawplus = 0;
I32 sawopen = 0;
regex_charset initial_charset = get_regex_charset(orig_rx_flags);
- bool code_is_utf8 = 0;
bool recompile = 0;
bool runtime_code = 0;
scan_data_t data;
if (expr && (expr->op_type == OP_LIST ||
(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
-
- /* is the source UTF8, and how many code blocks are there? */
+ /* allocate code_blocks if needed */
OP *o;
int ncode = 0;
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
- if (o->op_type == OP_CONST) {
- /* skip if we have SVs as well as OPs. In this case,
- * a) we decide utf8 based on SVs not OPs;
- * b) the current pad may not match that which the ops
- * were compiled in, so, so on threaded builds,
- * cSVOPo_sv would look in the wrong pad */
- if (!pat_count && SvUTF8(cSVOPo_sv))
- code_is_utf8 = 1;
- }
- else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
- /* count of DO blocks */
- ncode++;
- }
+ for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
+ if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
+ ncode++; /* count of DO blocks */
if (ncode) {
pRExC_state->num_code_blocks = ncode;
Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
}
}
- if (pat_count) {
- /* handle a list of SVs */
+ if (!pat_count) {
+ /* compile-time pattern with just OP_CONSTs and DO blocks */
- SV **svp;
- OP *o = NULL;
- int n = 0;
- STRLEN orig_patlen = 0;
+ int n;
+ OP *o;
- /* apply magic and RE overloading to each arg */
- for (svp = patternp; svp < patternp + pat_count; svp++) {
- SV *rx = *svp;
- SvGETMAGIC(rx);
- if (SvROK(rx) && SvAMAGIC(rx)) {
- SV *sv = AMG_CALLunary(rx, regexp_amg);
- if (sv) {
- if (SvROK(sv))
- sv = SvRV(sv);
- if (SvTYPE(sv) != SVt_REGEXP)
- Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
- *svp = sv;
- }
- }
- }
-
- /* process args, concat them if there are multiple ones,
- * and find any code block indexes */
+ /* find how many CONSTs there are */
+ assert(expr);
+ n = 0;
+ if (expr->op_type == OP_CONST)
+ n = 1;
+ else
+ for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ if (o->op_type == OP_CONST)
+ n++;
+ }
- if (pat_count > 1) {
- if (pRExC_state->num_code_blocks) {
- o = cLISTOPx(expr)->op_first;
- assert( o->op_type == OP_PUSHMARK
- || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
- || o->op_type == OP_PADRANGE);
- o = o->op_sibling;
- }
+ /* fake up an SV array */
- pat = newSVpvn("", 0);
- SAVEFREESV(pat);
-
- /* determine if the pattern is going to be utf8 (needed
- * in advance to align code block indices correctly).
- * XXX This could fail to be detected for an arg with
- * overloading but not concat overloading; but the main effect
- * in this obscure case is to need a 'use re eval' for a
- * literal code block */
- for (svp = patternp; svp < patternp + pat_count; svp++) {
- if (SvUTF8(*svp))
- SvUTF8_on(pat);
- }
- }
+ assert(!new_patternp);
+ Newx(new_patternp, n, SV*);
+ SAVEFREEPV(new_patternp);
+ pat_count = n;
- for (svp = patternp; svp < patternp + pat_count; svp++) {
- SV *sv, *msv = *svp;
- SV *rx = NULL;
- bool code = 0;
- /* we make the assumption here that each op in the list of
- * op_siblings maps to one SV pushed onto the stack,
- * except for code blocks, with have both an OP_NULL and
- * and OP_CONST.
- * This allows us to match up the list of SVs against the
- * list of OPs to find the next code block.
- *
- * Note that PUSHMARK PADSV PADSV ..
- * is optimised to
- * PADRANGE NULL NULL ..
- * so the alignment still works. */
- if (o) {
- if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
- assert(n < pRExC_state->num_code_blocks);
- pRExC_state->code_blocks[n].start = SvCUR(pat);
- pRExC_state->code_blocks[n].block = o;
- pRExC_state->code_blocks[n].src_regex = NULL;
- n++;
- code = 1;
- o = o->op_sibling; /* skip CONST */
- assert(o);
- }
- o = o->op_sibling;;
+ n = 0;
+ if (expr->op_type == OP_CONST)
+ new_patternp[n] = cSVOPx_sv(expr);
+ else
+ for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ if (o->op_type == OP_CONST)
+ new_patternp[n++] = cSVOPo_sv;
}
- /* try concatenation overload ... */
- if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
- (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
- {
- sv_setsv(pat, sv);
- /* overloading involved: all bets are off over literal
- * code. Pretend we haven't seen it */
- pRExC_state->num_code_blocks -= n;
- n = 0;
- }
- else {
- /* ... or failing that, try "" overload */
- while (SvAMAGIC(msv)
- && (sv = AMG_CALLunary(msv, string_amg))
- && sv != msv
- && !( SvROK(msv)
- && SvROK(sv)
- && SvRV(msv) == SvRV(sv))
- ) {
- msv = sv;
- SvGETMAGIC(msv);
- }
- if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
- msv = SvRV(msv);
- if (pat) {
- orig_patlen = SvCUR(pat);
- sv_catsv_nomg(pat, msv);
- rx = msv;
- }
- else
- pat = msv;
- if (code)
- pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
- }
+ }
- /* extract any code blocks within any embedded qr//'s */
- if (rx && SvTYPE(rx) == SVt_REGEXP
- && RX_ENGINE((REGEXP*)rx)->op_comp)
- {
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "Assembling pattern from %d elements%s\n", pat_count,
+ orig_rx_flags & RXf_SPLIT ? " for split" : ""));
- RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
- if (ri->num_code_blocks) {
- int i;
- /* the presence of an embedded qr// with code means
- * we should always recompile: the text of the
- * qr// may not have changed, but it may be a
- * different closure than last time */
- recompile = 1;
- Renew(pRExC_state->code_blocks,
- pRExC_state->num_code_blocks + ri->num_code_blocks,
- struct reg_code_block);
- pRExC_state->num_code_blocks += ri->num_code_blocks;
- for (i=0; i < ri->num_code_blocks; i++) {
- struct reg_code_block *src, *dst;
- STRLEN offset = orig_patlen
- + ReANY((REGEXP *)rx)->pre_prefix;
- assert(n < pRExC_state->num_code_blocks);
- src = &ri->code_blocks[i];
- dst = &pRExC_state->code_blocks[n];
- dst->start = src->start + offset;
- dst->end = src->end + offset;
- dst->block = src->block;
- dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
- src->src_regex
- ? src->src_regex
- : (REGEXP*)rx);
- n++;
- }
- }
- }
- }
- if (pat_count > 1)
- SvSETMAGIC(pat);
+ /* set expr to the first arg op */
- /* handle bare (possibly after overloading) regex: foo =~ $re */
- {
- SV *re = pat;
- if (SvROK(re))
- re = SvRV(re);
- if (SvTYPE(re) == SVt_REGEXP) {
- if (is_bare_re)
- *is_bare_re = TRUE;
- SvREFCNT_inc(re);
- Safefree(pRExC_state->code_blocks);
- return (REGEXP*)re;
- }
- }
+ if (pRExC_state->num_code_blocks
+ && expr->op_type != OP_CONST)
+ {
+ expr = cLISTOPx(expr)->op_first;
+ assert( expr->op_type == OP_PUSHMARK
+ || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
+ || expr->op_type == OP_PADRANGE);
+ expr = expr->op_sibling;
}
- else {
- /* not a list of SVs, so must be a list of OPs */
- int i = -1;
- bool is_code = 0;
- OP *o;
- OP *ofirst, *olast;
-
- assert(expr);
-
- if (expr->op_type == OP_LIST) {
- ofirst = cLISTOPx(expr)->op_first;
- olast = cLISTOPx(expr)->op_last;
- pat = newSVpvn("", 0);
- SAVEFREESV(pat);
- if (code_is_utf8)
- SvUTF8_on(pat);
- }
- else {
- assert(expr->op_type == OP_CONST);
- ofirst = olast = expr;
- pat = NULL;
- }
- /* given a list of CONSTs and DO blocks in expr, append all
- * the CONSTs to pat, and record the start and end of each
- * code block in code_blocks[] (each DO{} op is followed by an
- * OP_CONST containing the corresponding literal '(?{...})
- * text)
- */
- o = ofirst;
- while (1) {
- if (o->op_type == OP_CONST) {
- if (pat) {
- sv_catsv(pat, cSVOPo_sv);
- if (is_code) {
- pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
- is_code = 0;
- }
- }
- else {
- pat = cSVOPx_sv(expr);
- }
- }
- else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
- assert(i+1 < pRExC_state->num_code_blocks);
- pRExC_state->code_blocks[++i].start = SvCUR(pat);
- pRExC_state->code_blocks[i].block = o;
- pRExC_state->code_blocks[i].src_regex = NULL;
- is_code = 1;
- }
- if (o == olast)
- break;
- o = o->op_sibling;
- }
+ pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
+ expr, &recompile, NULL);
+
+ /* handle bare (possibly after overloading) regex: foo =~ $re */
+ {
+ SV *re = pat;
+ if (SvROK(re))
+ re = SvRV(re);
+ if (SvTYPE(re) == SVt_REGEXP) {
+ if (is_bare_re)
+ *is_bare_re = TRUE;
+ SvREFCNT_inc(re);
+ Safefree(pRExC_state->code_blocks);
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "Precompiled pattern%s\n",
+ orig_rx_flags & RXf_SPLIT ? " for split" : ""));
+
+ return (REGEXP*)re;
+ }
}
exp = SvPV_nomg(pat, plen);
- xend = exp + plen;
if (!eng->op_comp) {
if ((SvUTF8(pat) && IN_BYTES)
PL_colors[4],PL_colors[5],s);
});
- if (0) {
- redo_first_pass:
- {
- U8 *const src = (U8*)exp;
- U8 *dst;
- int n=0;
- STRLEN s = 0, d = 0;
- bool do_end = 0;
+ redo_first_pass:
+ /* we jump here if we upgrade the pattern to utf8 and have to
+ * recompile */
- /* It's possible to write a regexp in ascii that represents Unicode
- codepoints outside of the byte range, such as via \x{100}. If we
- detect such a sequence we have to convert the entire pattern to utf8
- and then recompile, as our sizing calculation will have been based
- on 1 byte == 1 character, but we will need to use utf8 to encode
- at least some part of the pattern, and therefore must convert the whole
- thing.
- -- dmq */
- DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
- "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
-
- /* upgrade pattern to UTF8, and if there are code blocks,
- * recalculate the indices.
- * This is essentially an unrolled Perl_bytes_to_utf8() */
-
- Newx(dst, plen * 2 + 1, U8);
-
- while (s < plen) {
- const UV uv = NATIVE_TO_ASCII(src[s]);
- if (UNI_IS_INVARIANT(uv))
- dst[d] = (U8)UTF_TO_NATIVE(uv);
- else {
- dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
- dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
- }
- if (n < pRExC_state->num_code_blocks) {
- if (!do_end && pRExC_state->code_blocks[n].start == s) {
- pRExC_state->code_blocks[n].start = d;
- assert(dst[d] == '(');
- do_end = 1;
- }
- else if (do_end && pRExC_state->code_blocks[n].end == s) {
- pRExC_state->code_blocks[n].end = d;
- assert(dst[d] == ')');
- do_end = 0;
- n++;
- }
- }
- s++;
- d++;
- }
- dst[d] = '\0';
- plen = d;
- exp = (char*) dst;
- xend = exp + plen;
- SAVEFREEPV(exp);
- RExC_orig_utf8 = RExC_utf8 = 1;
- }
- }
+ if ((pm_flags & PMf_USE_RE_EVAL)
+ /* this second condition covers the non-regex literal case,
+ * i.e. $foo =~ '(?{})'. */
+ || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
+ )
+ runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
/* return old regex if pattern hasn't changed */
/* XXX: note in the below we have to check the flags as well as the pattern.
&& ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
&& RX_PRECOMP(old_re)
&& RX_PRELEN(old_re) == plen
- && memEQ(RX_PRECOMP(old_re), exp, plen))
+ && memEQ(RX_PRECOMP(old_re), exp, plen)
+ && !runtime_code /* with runtime code, always recompile */ )
{
- /* with runtime code, always recompile */
- runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
- exp, plen);
- if (!runtime_code) {
- Safefree(pRExC_state->code_blocks);
- return old_re;
- }
+ Safefree(pRExC_state->code_blocks);
+ return old_re;
}
- else if ((pm_flags & PMf_USE_RE_EVAL)
- /* this second condition covers the non-regex literal case,
- * i.e. $foo =~ '(?{})'. */
- || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
- && (PL_hints & HINT_RE_EVAL))
- )
- runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
- exp, plen);
rx_flags = orig_rx_flags;
if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
/* whoops, we have a non-utf8 pattern, whilst run-time code
* got compiled as utf8. Try again with a utf8 pattern */
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+ pRExC_state->num_code_blocks);
goto redo_first_pass;
}
}
/* First pass: determine size, legality. */
RExC_parse = exp;
RExC_start = exp;
- RExC_end = xend;
+ RExC_end = exp + plen;
RExC_naughty = 0;
RExC_npar = 1;
RExC_nestroot = 0;
RExC_size = 0L;
- RExC_emit = &PL_regdummy;
+ RExC_emit = &RExC_emit_dummy;
RExC_whilem_seen = 0;
RExC_open_parens = NULL;
RExC_close_parens = NULL;
SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
}
if (reg(pRExC_state, 0, &flags,1) == NULL) {
+ /* It's possible to write a regexp in ascii that represents Unicode
+ codepoints outside of the byte range, such as via \x{100}. If we
+ detect such a sequence we have to convert the entire pattern to utf8
+ and then recompile, as our sizing calculation will have been based
+ on 1 byte == 1 character, but we will need to use utf8 to encode
+ at least some part of the pattern, and therefore must convert the whole
+ thing.
+ -- dmq */
if (flags & RESTART_UTF8) {
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+ pRExC_state->num_code_blocks);
goto redo_first_pass;
}
- Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
+ Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
}
if (code_blocksv)
SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
RExC_flags = rx_flags; /* don't let top level (?i) bleed */
RExC_pm_flags = pm_flags;
RExC_parse = exp;
- RExC_end = xend;
+ RExC_end = exp + plen;
RExC_naughty = 0;
RExC_npar = 1;
RExC_emit_start = ri->program;
REGC((U8)REG_MAGIC, (char*) RExC_emit++);
if (reg(pRExC_state, 0, &flags,1) == NULL) {
ReREFCNT_dec(rx);
- Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
+ Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
}
/* XXXX To minimize changes to RE engine we always allocate
3-units-long substrs field. */
minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
&data, -1, NULL, NULL,
- SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
+ SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
+ | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
+ 0);
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
- &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
+ &data, -1, NULL, NULL,
+ SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
+ |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
+ 0);
CHECK_RESTUDY_GOTO_butfirst(NOOP);
#define WASTED_O 0x01
#define WASTED_G 0x02
#define WASTED_C 0x04
-#define WASTED_GC (0x02|0x04)
+#define WASTED_GC (WASTED_G|WASTED_C)
I32 wastedflags = 0x00;
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
if (! (wastedflags & wflagbit) ) {
wastedflags |= wflagbit;
+ /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
vWARN5(
RExC_parse + 1,
"Useless (%s%c) - %suse /%c modifier",
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
if (! (wastedflags & WASTED_C) ) {
wastedflags |= WASTED_GC;
+ /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
vWARN3(
RExC_parse + 1,
"Useless (%sc) - %suse /gc modifier",
cannot happen. */
STATIC regnode *
S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
- /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
+ /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
+ * 2 is like 1, but indicates that nextchar() has been called to advance
+ * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
+ * this flag alerts us to the need to check for that */
{
dVAR;
regnode *ret; /* Will be the head of the group. */
/* Make an OPEN node, if parenthesized. */
if (paren) {
+
+ /* Under /x, space and comments can be gobbled up between the '(' and
+ * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
+ * intervening space, as the sequence is a token, and a token should be
+ * indivisible */
+ bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
+
if ( *RExC_parse == '*') { /* (*VERB:ARG) */
char *start_verb = RExC_parse;
STRLEN verb_len = 0;
unsigned char op = 0;
int argok = 1;
int internal_argval = 0; /* internal_argval is only useful if !argok */
+
+ if (has_intervening_patws && SIZE_ONLY) {
+ ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
+ }
while ( *RExC_parse && *RExC_parse != ')' ) {
if ( *RExC_parse == ':' ) {
start_arg = RExC_parse + 1;
}
nextchar(pRExC_state);
return ret;
- } else
- if (*RExC_parse == '?') { /* (?...) */
+ }
+ else if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
+ if (has_intervening_patws && SIZE_ONLY) {
+ ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
+ }
RExC_parse++;
paren = *RExC_parse++;
*flagp |= HASWIDTH;
Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
return ret;
case '@': /* (?@...) */
vFAIL2("Sequence (?%c...) not implemented", (int)paren);
break;
+ case '#': /* (?#...) */
+ /* XXX As soon as we disallow separating the '?' and '*' (by
+ * spaces or (?#...) comment), it is believed that this case
+ * will be unreachable and can be removed. See
+ * [perl #117327] */
+ while (*RExC_parse && *RExC_parse != ')')
+ RExC_parse++;
+ if (*RExC_parse != ')')
+ FAIL("Sequence (?#... not terminated");
+ nextchar(pRExC_state);
+ *flagp = TRYAGAIN;
+ return NULL;
case '0' : /* (?0) */
case 'R' : /* (?R) */
if (*RExC_parse != ')')
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X",
- flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
+ (UV) flags);
} else
REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
c = *nextchar(pRExC_state);
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X",
- flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
+ (UV) flags);
}
REGTAIL(pRExC_state, ret, lastbr);
if (flags&HASWIDTH)
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
}
if (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
}
REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
lastbr = br;
case ':':
ender = reg_node(pRExC_state, TAIL);
break;
- case 1:
+ case 1: case 2:
ender = reganode(pRExC_state, CLOSE, parno);
if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
if (paren == '>')
node = SUSPEND, flag = 0;
reginsert(pRExC_state, node,ret, depth+1);
- Set_Node_Cur_Length(ret);
+ Set_Node_Cur_Length(ret, parse_start);
Set_Node_Offset(ret, parse_start + 1);
ret->flags = flag;
REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
/* Check for proper termination. */
if (paren) {
- RExC_flags = oregflags;
+ /* restore original flags, but keep (?p) */
+ RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched (");
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
+ FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
}
else if (ret == NULL)
ret = latest;
if (flags & (TRYAGAIN|RESTART_UTF8))
*flagp |= flags & (TRYAGAIN|RESTART_UTF8);
else
- FAIL2("panic: regatom returned NULL, flags=%#X", flags);
+ FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
return(NULL);
}
RExC_emit = orig_emit;
}
ret = reg_node(pRExC_state, NOTHING);
+
+ /* But the quantifier includes any '?' (the non-greedy
+ * modifier) after the {}, [perl #118375] */
+ if (RExC_parse < RExC_end && *RExC_parse == '?') {
+ nextchar(pRExC_state);
+ }
return ret;
}
RExC_naughty += 2 + RExC_naughty / 2;
reginsert(pRExC_state, CURLY, ret, depth+1);
Set_Node_Offset(ret, parse_start+1); /* MJD */
- Set_Node_Cur_Length(ret);
+ Set_Node_Cur_Length(ret, parse_start);
}
else {
regnode * const w = reg_node(pRExC_state, WHILEM);
*flagp = RESTART_UTF8;
return FALSE;
}
- FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
- flags);
+ FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
+ (UV) flags);
}
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
* additionally will populate the node's STRING with <code_point>, if <len>
* is 0. In both cases <*flagp> is appropriately set
*
- * It knows that under FOLD, UTF characters and the Latin Sharp S must be
- * folded (the latter only when the rules indicate it can match 'ss') */
+ * It knows that under FOLD, the Latin Sharp S and UTF characters above
+ * 255, must be folded (the former only when the rules indicate it can
+ * match 'ss') */
bool len_passed_in = cBOOL(len != 0);
U8 character[UTF8_MAXBYTES_CASE+1];
if (! len_passed_in) {
if (UTF) {
- if (FOLD) {
- to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
+ if (FOLD && (! LOC || code_point > 255)) {
+ _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+ character,
+ &len,
+ FOLD_FLAGS_FULL | ((LOC)
+ ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
}
else {
uvchr_to_utf8( character, code_point);
if (ret == NULL) {
if (*flagp & RESTART_UTF8)
return NULL;
- FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+ (UV) *flagp);
}
nextchar(pRExC_state);
Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
}
case '(':
nextchar(pRExC_state);
- ret = reg(pRExC_state, 1, &flags,depth+1);
+ ret = reg(pRExC_state, 2, &flags,depth+1);
if (ret == NULL) {
if (flags & TRYAGAIN) {
if (RExC_parse == RExC_end) {
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
+ FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
}
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
break;
/* regclass() can only return RESTART_UTF8 if multi-char folds
are allowed. */
if (!ret)
- FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+ (UV) *flagp);
RExC_parse--;
Set_Node_Offset(ret, parse_start + 2);
- Set_Node_Cur_Length(ret);
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
}
break;
/* override incorrect value set in reganode MJD */
Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
}
goto parse_named_seq;
} }
num = atoi(RExC_parse);
- if (isg && num == 0)
- vFAIL("Reference to invalid group 0");
+ if (isg && num == 0) {
+ if (*RExC_parse == '0') {
+ vFAIL("Reference to invalid group 0");
+ }
+ else {
+ vFAIL("Unterminated \\g... pattern");
+ }
+ }
if (isrel) {
num = RExC_npar - num;
if (num < 1)
/* Probably a character specified in octal, e.g. \35 */
goto defchar;
else {
+#ifdef RE_TRACK_PATTERN_OFFSETS
char * const parse_start = RExC_parse - 1; /* MJD */
+#endif
while (isDIGIT(*RExC_parse))
RExC_parse++;
- if (parse_start == RExC_parse - 1)
- vFAIL("Unterminated \\g... pattern");
if (hasbrace) {
if (*RExC_parse != '}')
vFAIL("Unterminated \\g{...} pattern");
/* override incorrect value set in reganode MJD */
Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
RExC_parse--;
nextchar(pRExC_state);
}
if (! SIZE_ONLY
&& RExC_flags & RXf_PMf_EXTENDED
- && ckWARN(WARN_DEPRECATED)
+ && ckWARN_d(WARN_DEPRECATED)
&& is_PATWS_non_low(p, UTF))
{
vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
goto loopdone;
}
- if (FOLD) {
- if (UTF
- /* See comments for join_exact() as to why we fold
- * this non-UTF at compile time */
- || (node_type == EXACTFU
- && ender == LATIN_SMALL_LETTER_SHARP_S))
- {
-
-
- /* Prime the casefolded buffer. Locale rules, which
- * apply only to code points < 256, aren't known until
- * execution, so for them, just output the original
- * character using utf8. If we start to fold non-UTF
- * patterns, be sure to update join_exact() */
- if (LOC && ender < 256) {
- if (UNI_IS_INVARIANT(ender)) {
- *s = (U8) ender;
- foldlen = 1;
- } else {
- *s = UTF8_TWO_BYTE_HI(ender);
- *(s + 1) = UTF8_TWO_BYTE_LO(ender);
- foldlen = 2;
- }
+ if (! FOLD) {
+ if (UTF) {
+ const STRLEN unilen = reguni(pRExC_state, ender, s);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
}
- else {
- UV folded = _to_uni_fold_flags(
- ender,
- (U8 *) s,
- &foldlen,
- FOLD_FLAGS_FULL
- | ((LOC) ? FOLD_FLAGS_LOCALE
- : (ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0)
- );
- /* If this node only contains non-folding code
- * points so far, see if this new one is also
- * non-folding */
- if (maybe_exact) {
- if (folded != ender) {
- maybe_exact = FALSE;
+ /* The loop increments <len> each time, as all but this
+ * path (and one other) through it add a single byte to
+ * the EXACTish node. But this one has changed len to
+ * be the correct final value, so subtract one to
+ * cancel out the increment that follows */
+ len--;
+ }
+ else {
+ REGC((char)ender, s++);
+ }
+ }
+ else /* FOLD */
+ if (! ( UTF
+ /* See comments for join_exact() as to why we fold this
+ * non-UTF at compile time */
+ || (node_type == EXACTFU
+ && ender == LATIN_SMALL_LETTER_SHARP_S)))
+ {
+ *(s++) = (char) ender;
+ maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
+ }
+ else { /* FOLD, and either UTF or an EXACTFU sharp s */
+
+ /* Prime the casefolded buffer. Locale rules, which apply
+ * only to code points < 256, aren't known until execution,
+ * so for them, just output the original character using
+ * utf8. If we start to fold non-UTF patterns, be sure to
+ * update join_exact() */
+ if (LOC && ender < 256) {
+ if (UNI_IS_INVARIANT(ender)) {
+ *s = (U8) ender;
+ foldlen = 1;
+ } else {
+ *s = UTF8_TWO_BYTE_HI(ender);
+ *(s + 1) = UTF8_TWO_BYTE_LO(ender);
+ foldlen = 2;
+ }
+ }
+ else {
+ UV folded = _to_uni_fold_flags(
+ ender,
+ (U8 *) s,
+ &foldlen,
+ FOLD_FLAGS_FULL
+ | ((LOC) ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0)
+ );
+
+ /* If this node only contains non-folding code points
+ * so far, see if this new one is also non-folding */
+ if (maybe_exact) {
+ if (folded != ender) {
+ maybe_exact = FALSE;
+ }
+ else {
+ /* Here the fold is the original; we have
+ * to check further to see if anything
+ * folds to it */
+ if (! PL_utf8_foldable) {
+ SV* swash = swash_init("utf8",
+ "_Perl_Any_Folds",
+ &PL_sv_undef, 1, 0);
+ PL_utf8_foldable =
+ _get_swash_invlist(swash);
+ SvREFCNT_dec_NN(swash);
}
- else {
- /* Here the fold is the original; we have
- * to check further to see if anything
- * folds to it */
- if (! PL_utf8_foldable) {
- SV* swash = swash_init("utf8",
- "_Perl_Any_Folds",
- &PL_sv_undef, 1, 0);
- PL_utf8_foldable =
- _get_swash_invlist(swash);
- SvREFCNT_dec_NN(swash);
- }
- if (_invlist_contains_cp(PL_utf8_foldable,
- ender))
- {
- maybe_exact = FALSE;
- }
+ if (_invlist_contains_cp(PL_utf8_foldable,
+ ender))
+ {
+ maybe_exact = FALSE;
}
}
- ender = folded;
}
- s += foldlen;
-
- /* The loop increments <len> each time, as all but this
- * path (and the one just below for UTF) through it add
- * a single byte to the EXACTish node. But this one
- * has changed len to be the correct final value, so
- * subtract one to cancel out the increment that
- * follows */
- len += foldlen - 1;
- }
- else {
- *(s++) = (char) ender;
- maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
+ ender = folded;
}
+ s += foldlen;
+
+ /* The loop increments <len> each time, as all but this
+ * path (and one other) through it add a single byte to the
+ * EXACTish node. But this one has changed len to be the
+ * correct final value, so subtract one to cancel out the
+ * increment that follows */
+ len += foldlen - 1;
}
- else if (UTF) {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
-
- /* See comment just above for - 1 */
- len--;
- }
- else {
- REGC((char)ender, s++);
- }
if (next_is_quantifier) {
}
RExC_parse = p - 1;
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
{
/* len is STRLEN which is unsigned, need to copy to signed */
* these things, we need to realize that something preceded by a backslash
* is escaped, so we have to keep track of backslashes */
if (SIZE_ONLY) {
+ UV depth = 0; /* how many nested (?[...]) constructs */
Perl_ck_warner_d(aTHX_
packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
RExC_parse = regpatws(pRExC_state, RExC_parse,
TRUE); /* means recognize comments */
switch (*RExC_parse) {
+ case '?':
+ if (RExC_parse[1] == '[') depth++, RExC_parse++;
+ /* FALL THROUGH */
default:
break;
case '\\':
FALSE, /* don't allow multi-char folds */
TRUE, /* silence non-portable warnings. */
&current))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
+ (UV) *flagp);
/* function call leaves parse pointing to the ']', except
* if we faked it */
}
case ']':
+ if (depth--) break;
RExC_parse++;
if (RExC_parse < RExC_end
&& *RExC_parse == ')')
* been parsed and evaluated to a single operand (or else is a syntax
* error), and is handled as a regular operand */
- stack = newAV();
+ sv_2mortal((SV *)(stack = newAV()));
while (RExC_parse < RExC_end) {
I32 top_index = av_tindex(stack);
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
&current))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
+ (UV) *flagp);
/* regclass() will return with parsing just the \ sequence,
* leaving the parse pointer at the next thing to parse */
RExC_parse--;
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
&current))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
+ (UV) *flagp);
/* function call leaves parse pointing to the ']', except if we
* faked it */
if (is_posix_class) {
|| IS_OPERAND(lparen)
|| SvUV(lparen) != '(')
{
+ SvREFCNT_dec(current);
RExC_parse++;
vFAIL("Unexpected ')'");
}
}
else {
SV* top = av_pop(stack);
+ SV *prev = NULL;
char current_operator;
if (IS_OPERAND(top)) {
+ SvREFCNT_dec_NN(top);
+ SvREFCNT_dec_NN(current);
vFAIL("Operand with no preceding operator");
}
current_operator = (char) SvUV(top);
goto handle_operand;
case '&':
- _invlist_intersection(av_pop(stack),
+ prev = av_pop(stack);
+ _invlist_intersection(prev,
current,
&current);
av_push(stack, current);
case '|':
case '+':
- _invlist_union(av_pop(stack), current, &current);
+ prev = av_pop(stack);
+ _invlist_union(prev, current, &current);
av_push(stack, current);
break;
case '-':
- _invlist_subtract(av_pop(stack), current, &current);
+ prev = av_pop(stack);;
+ _invlist_subtract(prev, current, &current);
av_push(stack, current);
break;
SV* u = NULL;
SV* element;
- element = av_pop(stack);
- _invlist_union(element, current, &u);
- _invlist_intersection(element, current, &i);
+ prev = av_pop(stack);
+ _invlist_union(prev, current, &u);
+ _invlist_intersection(prev, current, &i);
+ /* _invlist_subtract will overwrite current
+ without freeing what it already contains */
+ element = current;
_invlist_subtract(u, i, &current);
av_push(stack, current);
SvREFCNT_dec_NN(i);
Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
}
SvREFCNT_dec_NN(top);
+ SvREFCNT_dec(prev);
}
}
RExC_end = save_end;
SvREFCNT_dec_NN(final);
SvREFCNT_dec_NN(result_string);
- SvREFCNT_dec_NN(stack);
nextchar(pRExC_state);
Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
/* <multi_char_matches> is actually an array of arrays.
* There will be one or two top-level elements: [2],
* and/or [3]. The [2] element is an array, each
- * element thereof is a character which folds to two
- * characters; likewise for [3]. (Unicode guarantees a
- * maximum of 3 characters in any fold.) When we
- * rewrite the character class below, we will do so
- * such that the longest folds are written first, so
- * that it prefers the longest matching strings first.
- * This is done even if it turns out that any
- * quantifier is non-greedy, out of programmer
- * laziness. Tom Christiansen has agreed that this is
- * ok. This makes the test for the ligature 'ffi' come
- * before the test for 'ff' */
+ * element thereof is a character which folds to TWO
+ * characters; [3] is for folds to THREE characters.
+ * (Unicode guarantees a maximum of 3 characters in any
+ * fold.) When we rewrite the character class below,
+ * we will do so such that the longest folds are
+ * written first, so that it prefers the longest
+ * matching strings first. This is done even if it
+ * turns out that any quantifier is non-greedy, out of
+ * programmer laziness. Tom Christiansen has agreed
+ * that this is ok. This makes the test for the
+ * ligature 'ffi' come before the test for 'ff' */
if (av_exists(multi_char_matches, cp_count)) {
this_array_ptr = (AV**) av_fetch(multi_char_matches,
cp_count, FALSE);
* doesn't allow them between above and below 256 */
if ((ASCII_FOLD_RESTRICTED
&& (isASCII(c) != isASCII(j)))
- || (LOC && ((c < 256) != (j < 256))))
- {
+ || (LOC && c < 256)) {
continue;
}
if (ret_invlist) {
*ret_invlist = cp_list;
+ SvREFCNT_dec(swash);
/* Discard the generated node */
if (SIZE_ONLY) {
{
dVAR;
- struct re_save_state *state;
-
- SAVEVPTR(PL_curcop);
- SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
-
- state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
- PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
- SSPUSHUV(SAVEt_RE_STATE);
-
- Copy(&PL_reg_state, state, 1, struct re_save_state);
-
- PL_reg_oldsaved = NULL;
- PL_reg_oldsavedlen = 0;
- PL_reg_oldsavedoffset = 0;
- PL_reg_oldsavedcoffset = 0;
- PL_reg_maxiter = 0;
- PL_reg_leftiter = 0;
- PL_reg_poscache = NULL;
- PL_reg_poscache_size = 0;
-#ifdef PERL_ANY_COW
- PL_nrs = NULL;
-#endif
-
/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
if (PL_curpm) {
const REGEXP * const rx = PM_GETRE(PL_curpm);