I32 whilem_seen; /* number of WHILEM in this expr */
regnode *emit_start; /* Start of emitted-code area */
regnode *emit_bound; /* First regnode outside of the allocated space */
- regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
+ regnode *emit; /* Code-emit pointer; if = &emit_dummy,
+ implies compiling, so don't emit */
+ regnode emit_dummy; /* placeholder for emit to point to */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
#endif
#define RExC_emit (pRExC_state->emit)
+#define RExC_emit_dummy (pRExC_state->emit_dummy)
#define RExC_emit_start (pRExC_state->emit_start)
#define RExC_emit_bound (pRExC_state->emit_bound)
#define RExC_naughty (pRExC_state->naughty)
#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
#define SCF_SEEN_ACCEPT 0x8000
+#define SCF_TRIE_DOING_RESTUDY 0x10000
#define UTF cBOOL(RExC_utf8)
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
-#define ckWARN2regdep(loc,m, a1) STMT_START { \
+#define ckWARN2reg_d(loc,m, a1) STMT_START { \
const IV offset = loc - RExC_precomp; \
- Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
m REPORT_LOCATION, \
a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define Set_Cur_Node_Offset
#define Set_Node_Length_To_R(node,len)
#define Set_Node_Length(node,len)
-#define Set_Node_Cur_Length(node)
+#define Set_Node_Cur_Length(node,start)
#define Node_Offset(n)
#define Node_Length(n)
#define Set_Node_Offset_Length(node,offset,len)
#define Set_Node_Length(node,len) \
Set_Node_Length_To_R((node)-RExC_emit_start, len)
-#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
-#define Set_Node_Cur_Length(node) \
- Set_Node_Length(node, RExC_parse - parse_start)
+#define Set_Node_Cur_Length(node, start) \
+ Set_Node_Length(node, RExC_parse - start)
/* Get offsets and lengths */
#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
}
/* These two functions currently do the exact same thing */
-#define cl_init_zero S_cl_init
+#define cl_init_zero cl_init
/* 'AND' a given class with another one. Can create false positives. 'cl'
* should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
len = 0; \
} else { \
len = 1; \
- uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
+ uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
skiplen = UNISKIP(uvc); \
foldlen -= skiplen; \
scan = foldbuf + skiplen; \
* this file makes sure that in EXACTFU nodes, the sharp s gets folded to
* 'ss', even if the pattern isn't UTF-8. This avoids the issues
* described in the next item.
- * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
- * 'ss' or not is not knowable at compile time. It will match iff the
- * target string is in UTF-8, unlike the EXACTFU nodes, where it always
- * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
- * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
- * described in item 3). An assumption that the optimizer part of
- * regexec.c (probably unwittingly) makes is that a character in the
- * pattern corresponds to at most a single character in the target string.
- * (And I do mean character, and not byte here, unlike other parts of the
- * documentation that have never been updated to account for multibyte
- * Unicode.) This assumption is wrong only in this case, as all other
- * cases are either 1-1 folds when no UTF-8 is involved; or is true by
- * virtue of having this file pre-fold UTF-8 patterns. I'm
- * reluctant to try to change this assumption, so instead the code punts.
- * This routine examines EXACTF nodes for the sharp s, and returns a
- * boolean indicating whether or not the node is an EXACTF node that
- * contains a sharp s. When it is true, the caller sets a flag that later
- * causes the optimizer in this file to not set values for the floating
- * and fixed string lengths, and thus avoids the optimizer code in
- * regexec.c that makes the invalid assumption. Thus, there is no
- * optimization based on string lengths for EXACTF nodes that contain the
- * sharp s. This only happens for /id rules (which means the pattern
- * isn't in UTF-8).
+ * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
+ * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
+ * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
+ * (probably unwittingly, in Perl_regexec_flags()) makes is that a
+ * character in the pattern corresponds to at most a single character in
+ * the target string. (And I do mean character, and not byte here, unlike
+ * other parts of the documentation that have never been updated to
+ * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
+ * two character string 'ss'; in EXACTFA nodes it can match
+ * "\x{17F}\x{17F}". These violate the assumption, and they are the only
+ * instances where it is violated. I'm reluctant to try to change the
+ * assumption, as the code involved is impenetrable to me (khw), so
+ * instead the code here punts. This routine examines (when the pattern
+ * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
+ * boolean indicating whether or not the node contains a sharp s. When it
+ * is true, the caller sets a flag that later causes the optimizer in this
+ * file to not set values for the floating and fixed string lengths, and
+ * thus avoids the optimizer code in regexec.c that makes the invalid
+ * assumption. Thus, there is no optimization based on string lengths for
+ * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
+ * (The reason the assumption is wrong only in these two cases is that all
+ * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
+ * other folds to their expanded versions. We can't prefold sharp s to
+ * 'ss' in EXACTF nodes because we don't know at compile time if it
+ * actually matches 'ss' or not. It will match iff the target string is
+ * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
+ * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
+ * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
+ * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
+ * require the pattern to be forced into UTF-8, the overhead of which we
+ * want to avoid.)
*/
#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
OP(scan) = EXACTFU_SS;
s += 2;
}
- else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
+ else if (len == 6 /* len is the same in both ASCII and EBCDIC
+ for these */
&& (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
COMBINING_DIAERESIS_UTF8
COMBINING_ACUTE_ACCENT_UTF8,
next_iteration: ;
}
}
- else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+ else if (OP(scan) == EXACTFA) {
- /* Here, the pattern is not UTF-8. Look for the multi-char folds
- * that are all ASCII. As in the above case, EXACTFL and EXACTFA
- * nodes can't have multi-char folds to this range (and there are
- * no existing ones in the upper latin1 range). In the EXACTF
- * case we look also for the sharp s, which can be in the final
+ /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
+ * fold to the ASCII range (and there are no existing ones in the
+ * upper latin1 range). But, as outlined in the comments preceding
+ * this function, we need to flag any occurrences of the sharp s */
+ while (s < s_end) {
+ if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+ *has_exactf_sharp_s = TRUE;
+ break;
+ }
+ s++;
+ continue;
+ }
+ }
+ else if (OP(scan) != EXACTFL) {
+
+ /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
+ * multi-char folds that are all Latin1. (This code knows that
+ * there are no current multi-char folds possible with EXACTFL,
+ * relying on fold_grind.t to catch any errors if the very unlikely
+ * event happens that some get added in future Unicode versions.)
+ * As explained in the comments preceding this function, we look
+ * also for the sharp s in EXACTF nodes; it can be in the final
* position. Otherwise we can stop looking 1 byte earlier because
* have to find at least two characters for a multi-fold */
const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
}
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
- if ( /* ? quantifier ok, except for (?{ ... }) */
- (next_is_eval || !(mincount == 0 && maxcount == 1))
+ if (!(flags & SCF_TRIE_DOING_RESTUDY)
+ /* ? quantifier ok, except for (?{ ... }) */
+ && (next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
&& data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
&& maxcount <= REG_INFTY/3) /* Complement check for big count */
static void
S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
- char **pat_p, STRLEN *plen_p)
+ char **pat_p, STRLEN *plen_p, int num_code_blocks)
{
U8 *const src = (U8*)*pat_p;
U8 *dst;
dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
}
- if (n < pRExC_state->num_code_blocks) {
+ if (n < num_code_blocks) {
if (!do_end && pRExC_state->code_blocks[n].start == s) {
pRExC_state->code_blocks[n].start = d;
assert(dst[d] == '(');
/* S_concat_pat(): concatenate a list of args to the pattern string pat,
* while recording any code block indices, and handling overloading,
- * nested qr// objects etc.
- * Returns pat (or the first arg, if pat was null , i.e. there is only
- * one arg).
+ * nested qr// objects etc. If pat is null, it will allocate a new
+ * string, or just return the first arg, if there's only one.
+ *
+ * Returns the malloced/updated pat.
* patternp and pat_count is the array of SVs to be concatted;
* oplist is the optional list of ops that generated the SVs;
* recompile_p is a pointer to a boolean that will be set if
* the regex will need to be recompiled.
+ * delim, if non-null is an SV that will be inserted between each element
*/
static SV*
S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
SV *pat, SV ** const patternp, int pat_count,
- OP *oplist, bool *recompile_p)
+ OP *oplist, bool *recompile_p, SV *delim)
{
SV **svp;
int n = 0;
+ bool use_delim = FALSE;
+ bool alloced = FALSE;
- assert(!pat || pat_count > 1);
+ /* if we know we have at least two args, create an empty string,
+ * then concatenate args to that. For no args, return an empty string */
+ if (!pat && pat_count != 1) {
+ pat = newSVpvn("", 0);
+ SAVEFREESV(pat);
+ alloced = TRUE;
+ }
for (svp = patternp; svp < patternp + pat_count; svp++) {
SV *sv;
SV *rx = NULL;
STRLEN orig_patlen = 0;
bool code = 0;
- SV *msv = *svp;
+ SV *msv = use_delim ? delim : *svp;
+
+ /* if we've got a delimiter, we go round the loop twice for each
+ * svp slot (except the last), using the delimiter the second
+ * time round */
+ if (use_delim) {
+ svp--;
+ use_delim = FALSE;
+ }
+ else if (delim)
+ use_delim = TRUE;
+
+ if (SvTYPE(msv) == SVt_PVAV) {
+ /* we've encountered an interpolated array within
+ * the pattern, e.g. /...@a..../. Expand the list of elements,
+ * then recursively append elements.
+ * The code in this block is based on S_pushav() */
+
+ AV *const av = (AV*)msv;
+ const I32 maxarg = AvFILL(av) + 1;
+ SV **array;
+
+ if (oplist) {
+ assert(oplist->op_type == OP_PADAV
+ || oplist->op_type == OP_RV2AV);
+ oplist = oplist->op_sibling;;
+ }
+
+ if (SvRMAGICAL(av)) {
+ U32 i;
+
+ Newx(array, maxarg, SV*);
+ SAVEFREEPV(array);
+ for (i=0; i < (U32)maxarg; i++) {
+ SV ** const svp = av_fetch(av, i, FALSE);
+ array[i] = svp ? *svp : &PL_sv_undef;
+ }
+ }
+ else
+ array = AvARRAY(av);
+
+ pat = S_concat_pat(aTHX_ pRExC_state, pat,
+ array, maxarg, NULL, recompile_p,
+ /* $" */
+ GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
+
+ continue;
+ }
+
/* we make the assumption here that each op in the list of
* op_siblings maps to one SV pushed onto the stack,
}
if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
msv = SvRV(msv);
+
if (pat) {
/* this is a partially unrolled
* sv_catsv_nomg(pat, msv);
* that allows us to adjust code block indices if
* needed */
- STRLEN slen, dlen;
+ STRLEN dlen;
char *dst = SvPV_force_nomg(pat, dlen);
- const char *src = SvPV_flags_const(msv, slen, 0);
orig_patlen = dlen;
if (SvUTF8(msv) && !SvUTF8(pat)) {
- S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen);
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
sv_setpvn(pat, dst, dlen);
SvUTF8_on(pat);
}
- sv_catpvn_nomg(pat, src, slen);
+ sv_catsv_nomg(pat, msv);
rx = msv;
}
else
pat = msv;
+
if (code)
pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
}
}
}
}
+ /* avoid calling magic multiple times on a single element e.g. =~ $qr */
+ if (alloced)
+ SvSETMAGIC(pat);
+
return pat;
}
I32 flags;
I32 minlen = 0;
U32 rx_flags;
- SV *pat = NULL;
+ SV *pat;
SV *code_blocksv = NULL;
SV** new_patternp = patternp;
I32 sawlookahead = 0;
I32 sawplus = 0;
I32 sawopen = 0;
+ I32 sawminmod = 0;
+
regex_charset initial_charset = get_regex_charset(orig_rx_flags);
bool recompile = 0;
bool runtime_code = 0;
expr = expr->op_sibling;
}
- if (pat_count > 1) {
- pat = newSVpvn("", 0);
- SAVEFREESV(pat);
- }
-
- pat = S_concat_pat(aTHX_ pRExC_state, pat, new_patternp, pat_count,
- expr, &recompile);
-
- if (pat_count > 1)
- SvSETMAGIC(pat);
+ pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
+ expr, &recompile, NULL);
/* handle bare (possibly after overloading) regex: foo =~ $re */
{
if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
/* whoops, we have a non-utf8 pattern, whilst run-time code
* got compiled as utf8. Try again with a utf8 pattern */
- S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen);
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+ pRExC_state->num_code_blocks);
goto redo_first_pass;
}
}
RExC_npar = 1;
RExC_nestroot = 0;
RExC_size = 0L;
- RExC_emit = &PL_regdummy;
+ RExC_emit = &RExC_emit_dummy;
RExC_whilem_seen = 0;
RExC_open_parens = NULL;
RExC_close_parens = NULL;
thing.
-- dmq */
if (flags & RESTART_UTF8) {
- S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen);
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+ pRExC_state->num_code_blocks);
goto redo_first_pass;
}
- Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
+ Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
}
if (code_blocksv)
SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
REGC((U8)REG_MAGIC, (char*) RExC_emit++);
if (reg(pRExC_state, 0, &flags,1) == NULL) {
ReREFCNT_dec(rx);
- Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
+ Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
}
/* XXXX To minimize changes to RE engine we always allocate
3-units-long substrs field. */
}
reStudy:
- r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
+ r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
Zero(r->substrs, 1, struct reg_substr_data);
#ifdef TRIE_STUDY_OPT
* the only op that could be a regnode is PLUS, all the rest
* will be regnode_1 or regnode_2.
*
+ * (yves doesn't think this is true)
*/
if (OP(first) == PLUS)
sawplus = 1;
- else
+ else {
+ if (OP(first) == MINMOD)
+ sawminmod = 1;
first += regarglen[OP(first)];
-
+ }
first = NEXTOPER(first);
first_next= regnext(first);
}
first = NEXTOPER(first);
goto again;
}
- if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
+ if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
&& !pRExC_state->num_code_blocks) /* May examine pos and $& */
/* x+ must match at the 1st pos of run of x's */
r->intflags |= PREGf_SKIP;
minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
&data, -1, NULL, NULL,
- SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
+ SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
+ | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
+ 0);
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
&& data.last_start_min == 0 && data.last_end > 0
&& !RExC_seen_zerolen
&& !(RExC_seen & REG_SEEN_VERBARG)
- && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
+ && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
r->extflags |= RXf_CHECK_ALL;
scan_commit(pRExC_state, &data,&minlen,0);
r->check_offset_min = r->float_min_offset;
r->check_offset_max = r->float_max_offset;
}
- /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
- This should be changed ASAP! */
- if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
+ if ((r->check_substr || r->check_utf8) ) {
r->extflags |= RXf_USE_INTUIT;
if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
r->extflags |= RXf_INTUIT_TAIL;
minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
- &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
+ &data, -1, NULL, NULL,
+ SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
+ |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
+ 0);
CHECK_RESTUDY_GOTO_butfirst(NOOP);
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
- if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
+ if ( n == RX_BUFF_IDX_CARET_PREMATCH
|| n == RX_BUFF_IDX_CARET_FULLMATCH
|| n == RX_BUFF_IDX_CARET_POSTMATCH
- )
- && !(rx->extflags & RXf_PMf_KEEPCOPY)
- )
- goto ret_undef;
+ )
+ {
+ bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+ if (!keepcopy) {
+ /* on something like
+ * $r = qr/.../;
+ * /$qr/p;
+ * the KEEPCOPY is set on the PMOP rather than the regex */
+ if (PL_curpm && r == PM_GETRE(PL_curpm))
+ keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+ }
+ if (!keepcopy)
+ goto ret_undef;
+ }
if (!rx->subbeg)
goto ret_undef;
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
+ if ( paren == RX_BUFF_IDX_CARET_PREMATCH
+ || paren == RX_BUFF_IDX_CARET_FULLMATCH
+ || paren == RX_BUFF_IDX_CARET_POSTMATCH
+ )
+ {
+ bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+ if (!keepcopy) {
+ /* on something like
+ * $r = qr/.../;
+ * /$qr/p;
+ * the KEEPCOPY is set on the PMOP rather than the regex */
+ if (PL_curpm && r == PM_GETRE(PL_curpm))
+ keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+ }
+ if (!keepcopy)
+ goto warn_undef;
+ }
+
/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
switch (paren) {
case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
- /*FALLTHROUGH*/
-
case RX_BUFF_IDX_PREMATCH: /* $` */
if (rx->offs[0].start != -1) {
i = rx->offs[0].start;
return 0;
case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
case RX_BUFF_IDX_POSTMATCH: /* $' */
if (rx->offs[0].end != -1) {
i = rx->sublen - rx->offs[0].end;
}
return 0;
- case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
- /*FALLTHROUGH*/
-
- /* $& / ${^MATCH}, $1, $2, ... */
- default:
+ default: /* $& / ${^MATCH}, $1, $2, ... */
if (paren <= (I32)rx->nparens &&
(s1 = rx->offs[paren].start) != -1 &&
(t1 = rx->offs[paren].end) != -1)
/* This section of code defines the inversion list object and its methods. The
* interfaces are highly subject to change, so as much as possible is static to
* this file. An inversion list is here implemented as a malloc'd C UV array
- * with some added info that is placed as UVs at the beginning in a header
- * portion. An inversion list for Unicode is an array of code points, sorted
- * by ordinal number. The zeroth element is the first code point in the list.
- * The 1th element is the first element beyond that not in the list. In other
- * words, the first range is
+ * as an SVt_INVLIST scalar.
+ *
+ * An inversion list for Unicode is an array of code points, sorted by ordinal
+ * number. The zeroth element is the first code point in the list. The 1th
+ * element is the first element beyond that not in the list. In other words,
+ * the first range is
* invlist[0]..(invlist[1]-1)
* The other ranges follow. Thus every element whose index is divisible by two
* marks the beginning of a range that is in the list, and every element not
* Taking the complement (inverting) an inversion list is quite simple, if the
* first element is 0, remove it; otherwise add a 0 element at the beginning.
* This implementation reserves an element at the beginning of each inversion
- * list to contain 0 when the list contains 0, and contains 1 otherwise. The
- * actual beginning of the list is either that element if 0, or the next one if
- * 1.
+ * list to always contain 0; there is an additional flag in the header which
+ * indicates if the list begins at the 0, or is offset to begin at the next
+ * element.
*
* More about inversion lists can be found in "Unicode Demystified"
* Chapter 13 by Richard Gillam, published by Addison-Wesley.
* should eventually be made public */
/* The header definitions are in F<inline_invlist.c> */
-#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
-#define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
-
-#define INVLIST_INITIAL_LEN 10
PERL_STATIC_INLINE UV*
S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
{
/* Returns a pointer to the first element in the inversion list's array.
* This is called upon initialization of an inversion list. Where the
- * array begins depends on whether the list has the code point U+0000
- * in it or not. The other parameter tells it whether the code that
- * follows this call is about to put a 0 in the inversion list or not.
- * The first element is either the element with 0, if 0, or the next one,
- * if 1 */
+ * array begins depends on whether the list has the code point U+0000 in it
+ * or not. The other parameter tells it whether the code that follows this
+ * call is about to put a 0 in the inversion list or not. The first
+ * element is either the element reserved for 0, if TRUE, or the element
+ * after it, if FALSE */
- UV* zero = get_invlist_zero_addr(invlist);
+ bool* offset = get_invlist_offset_addr(invlist);
+ UV* zero_addr = (UV *) SvPVX(invlist);
PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
/* Must be empty */
- assert(! *_get_invlist_len_addr(invlist));
+ assert(! _invlist_len(invlist));
+
+ *zero_addr = 0;
/* 1^1 = 0; 1^0 = 1 */
- *zero = 1 ^ will_have_0;
- return zero + *zero;
+ *offset = 1 ^ will_have_0;
+ return zero_addr + *offset;
}
PERL_STATIC_INLINE UV*
/* Must not be empty. If these fail, you probably didn't check for <len>
* being non-zero before trying to get the array */
- assert(*_get_invlist_len_addr(invlist));
- assert(*get_invlist_zero_addr(invlist) == 0
- || *get_invlist_zero_addr(invlist) == 1);
-
- /* The array begins either at the element reserved for zero if the
- * list contains 0 (that element will be set to 0), or otherwise the next
- * element (in which case the reserved element will be set to 1). */
- return (UV *) (get_invlist_zero_addr(invlist)
- + *get_invlist_zero_addr(invlist));
+ assert(_invlist_len(invlist));
+
+ /* The very first element always contains zero, The array begins either
+ * there, or if the inversion list is offset, at the element after it.
+ * The offset header field determines which; it contains 0 or 1 to indicate
+ * how much additionally to add */
+ assert(0 == *(SvPVX(invlist)));
+ return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
}
PERL_STATIC_INLINE void
-S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
+S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
{
- /* Sets the current number of elements stored in the inversion list */
+ /* Sets the current number of elements stored in the inversion list.
+ * Updates SvCUR correspondingly */
PERL_ARGS_ASSERT_INVLIST_SET_LEN;
- *_get_invlist_len_addr(invlist) = len;
-
- assert(len <= SvLEN(invlist));
-
- SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
- /* If the list contains U+0000, that element is part of the header,
- * and should not be counted as part of the array. It will contain
- * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
- * subtract:
- * SvCUR_set(invlist,
- * TO_INTERNAL_SIZE(len
- * - (*get_invlist_zero_addr(inv_list) ^ 1)));
- * But, this is only valid if len is not 0. The consequences of not doing
- * this is that the memory allocation code may think that 1 more UV is
- * being used than actually is, and so might do an unnecessary grow. That
- * seems worth not bothering to make this the precise amount.
- *
- * Note that when inverting, SvCUR shouldn't change */
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ SvCUR_set(invlist,
+ (len == 0)
+ ? 0
+ : TO_INTERNAL_SIZE(len + offset));
+ assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
}
PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
{
- /* Return the address of the UV that is reserved to hold the cached index
+ /* Return the address of the IV that is reserved to hold the cached index
* */
PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
- return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ return &(((XINVLIST*) SvANY(invlist))->prev_index);
}
PERL_STATIC_INLINE IV
PERL_ARGS_ASSERT_INVLIST_MAX;
- return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
- ? _invlist_len(invlist)
- : FROM_INTERNAL_SIZE(SvLEN(invlist));
-}
-
-PERL_STATIC_INLINE UV*
-S_get_invlist_zero_addr(pTHX_ SV* invlist)
-{
- /* Return the address of the UV that is reserved to hold 0 if the inversion
- * list contains 0. This has to be the last element of the heading, as the
- * list proper starts with either it if 0, or the next element if not.
- * (But we force it to contain either 0 or 1) */
+ assert(SvTYPE(invlist) == SVt_INVLIST);
- PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
-
- return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
+ /* Assumes worst case, in which the 0 element is not counted in the
+ * inversion list, so subtracts 1 for that */
+ return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
+ ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
+ : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
}
#ifndef PERL_IN_XSUB_RE
SV* new_list;
if (initial_size < 0) {
- initial_size = INVLIST_INITIAL_LEN;
+ initial_size = 10;
}
/* Allocate the initial space */
- new_list = newSV(TO_INTERNAL_SIZE(initial_size));
- invlist_set_len(new_list, 0);
+ new_list = newSV_type(SVt_INVLIST);
- /* Force iterinit() to be used to get iteration to work */
- *get_invlist_iter_addr(new_list) = UV_MAX;
+ /* First 1 is in case the zero element isn't in the list; second 1 is for
+ * trailing NUL */
+ SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
+ invlist_set_len(new_list, 0, 0);
- /* This should force a segfault if a method doesn't initialize this
- * properly */
- *get_invlist_zero_addr(new_list) = UV_MAX;
+ /* Force iterinit() to be used to get iteration to work */
+ *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
*get_invlist_previous_index_addr(new_list) = 0;
- *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
-#if HEADER_LENGTH != 5
-# error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
-#endif
return new_list;
}
#endif
STATIC SV*
-S__new_invlist_C_array(pTHX_ UV* list)
+S__new_invlist_C_array(pTHX_ const UV* const list)
{
/* Return a pointer to a newly constructed inversion list, initialized to
* point to <list>, which has to be in the exact correct inversion list
* form, including internal fields. Thus this is a dangerous routine that
- * should not be used in the wrong hands */
+ * should not be used in the wrong hands. The passed in 'list' contains
+ * several header fields at the beginning that are not part of the
+ * inversion list body proper */
+
+ const STRLEN length = (STRLEN) list[0];
+ const UV version_id = list[1];
+ const bool offset = cBOOL(list[2]);
+#define HEADER_LENGTH 3
+ /* If any of the above changes in any way, you must change HEADER_LENGTH
+ * (if appropriate) and regenerate INVLIST_VERSION_ID by running
+ * perl -E 'say int(rand 2**31-1)'
+ */
+#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
+ data structure type, so that one being
+ passed in can be validated to be an
+ inversion list of the correct vintage.
+ */
- SV* invlist = newSV_type(SVt_PV);
+ SV* invlist = newSV_type(SVt_INVLIST);
PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
- SvPV_set(invlist, (char *) list);
+ if (version_id != INVLIST_VERSION_ID) {
+ Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
+ }
+
+ /* The generated array passed in includes header elements that aren't part
+ * of the list proper, so start it just after them */
+ SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
+
SvLEN_set(invlist, 0); /* Means we own the contents, and the system
shouldn't touch it */
- SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
- if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
- Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
- }
+ *(get_invlist_offset_addr(invlist)) = offset;
+
+ /* The 'length' passed to us is the physical number of elements in the
+ * inversion list. But if there is an offset the logical number is one
+ * less than that */
+ invlist_set_len(invlist, length - offset, offset);
- /* Initialize the iteration pointer.
- * XXX This could be done at compile time in charclass_invlists.h, but I
- * (khw) am not confident that the suffixes for specifying the C constant
- * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured
- * to use 64 bits; might need a Configure probe */
+ invlist_set_previous_index(invlist, 0);
+
+ /* Initialize the iteration pointer. */
invlist_iterfinish(invlist);
return invlist;
PERL_ARGS_ASSERT_INVLIST_EXTEND;
- SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ /* Add one to account for the zero element at the beginning which may not
+ * be counted by the calling parameters */
+ SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
}
PERL_STATIC_INLINE void
{
PERL_ARGS_ASSERT_INVLIST_TRIM;
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
/* Change the length of the inversion list to how many entries it currently
* has */
-
SvPV_shrink_to_cur((SV *) invlist);
}
UV* array;
UV max = invlist_max(invlist);
UV len = _invlist_len(invlist);
+ bool offset;
PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
if (len == 0) { /* Empty lists must be initialized */
- array = _invlist_array_init(invlist, start == 0);
+ offset = start != 0;
+ array = _invlist_array_init(invlist, ! offset);
}
else {
/* Here, the existing list is non-empty. The current max entry in the
* value not in the set, it is extending the set, so the new first
* value not in the set is one greater than the newly extended range.
* */
+ offset = *get_invlist_offset_addr(invlist);
if (array[final_element] == start) {
if (end != UV_MAX) {
array[final_element] = end + 1;
else {
/* But if the end is the maximum representable on the machine,
* just let the range that this would extend to have no end */
- invlist_set_len(invlist, len - 1);
+ invlist_set_len(invlist, len - 1, offset);
}
return;
}
len += 2; /* Includes an element each for the start and end of range */
- /* If overflows the existing space, extend, which may cause the array to be
- * moved */
+ /* If wll overflow the existing space, extend, which may cause the array to
+ * be moved */
if (max < len) {
invlist_extend(invlist, len);
- invlist_set_len(invlist, len); /* Have to set len here to avoid assert
- failure in invlist_array() */
+
+ /* Have to set len here to avoid assert failure in invlist_array() */
+ invlist_set_len(invlist, len, offset);
+
array = invlist_array(invlist);
}
else {
- invlist_set_len(invlist, len);
+ invlist_set_len(invlist, len, offset);
}
/* The next item on the list starts the range, the one after that is
else {
/* But if the end is the maximum representable on the machine, just let
* the range have no end */
- invlist_set_len(invlist, len - 1);
+ invlist_set_len(invlist, len - 1, offset);
}
}
}
void
-Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
+Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
{
/* Take the union of two inversion lists and point <output> to it. *output
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
* return the larger of the input lists, but then outside code might need
* to keep track of whether to free the input list or not */
- UV* array_a; /* a's array */
- UV* array_b;
+ const UV* array_a; /* a's array */
+ const UV* array_b;
UV len_a; /* length of a's array */
UV len_b;
if (complement_b) {
/* To complement, we invert: if the first element is 0, remove it. To
- * do this, we just pretend the array starts one later, and clear the
- * flag as we don't have to do anything else later */
+ * do this, we just pretend the array starts one later */
if (array_b[0] == 0) {
array_b++;
len_b--;
- complement_b = FALSE;
}
else {
- /* But if the first element is not zero, we unshift a 0 before the
- * array. The data structure reserves a space for that 0 (which
- * should be a '1' right now), so physical shifting is unneeded,
- * but temporarily change that element to 0. Before exiting the
- * routine, we must restore the element to '1' */
+ /* But if the first element is not zero, we pretend the list starts
+ * at the 0 that is always stored immediately before the array. */
array_b--;
len_b++;
- array_b[0] = 0;
}
}
/* Set result to final length, which can change the pointer to array_u, so
* re-find it */
if (len_u != _invlist_len(u)) {
- invlist_set_len(u, len_u);
+ invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
invlist_trim(u);
array_u = invlist_array(u);
}
}
}
- /* If we've changed b, restore it */
- if (complement_b) {
- array_b[0] = 1;
- }
-
/* We may be removing a reference to one of the inputs */
if (a == *output || b == *output) {
assert(! invlist_is_iterating(*output));
}
void
-Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
+Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
{
/* Take the intersection of two inversion lists and point <i> to it. *i
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
* union above
*/
- UV* array_a; /* a's array */
- UV* array_b;
+ const UV* array_a; /* a's array */
+ const UV* array_b;
UV len_a; /* length of a's array */
UV len_b;
assert(a != b);
/* Special case if either one is empty */
- len_a = _invlist_len(a);
+ len_a = (a == NULL) ? 0 : _invlist_len(a);
if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
if (len_a != 0 && complement_b) {
* must be every possible code point. Thus the intersection is
* simply 'a'. */
if (*i != a) {
- *i = invlist_clone(a);
-
if (*i == b) {
SvREFCNT_dec_NN(b);
}
+
+ *i = invlist_clone(a);
}
/* else *i is already 'a' */
return;
if (complement_b) {
/* To complement, we invert: if the first element is 0, remove it. To
- * do this, we just pretend the array starts one later, and clear the
- * flag as we don't have to do anything else later */
+ * do this, we just pretend the array starts one later */
if (array_b[0] == 0) {
array_b++;
len_b--;
- complement_b = FALSE;
}
else {
- /* But if the first element is not zero, we unshift a 0 before the
- * array. The data structure reserves a space for that 0 (which
- * should be a '1' right now), so physical shifting is unneeded,
- * but temporarily change that element to 0. Before exiting the
- * routine, we must restore the element to '1' */
+ /* But if the first element is not zero, we pretend the list starts
+ * at the 0 that is always stored immediately before the array. */
array_b--;
len_b++;
- array_b[0] = 0;
}
}
/* Set result to final length, which can change the pointer to array_r, so
* re-find it */
if (len_r != _invlist_len(r)) {
- invlist_set_len(r, len_r);
+ invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
invlist_trim(r);
array_r = invlist_array(r);
}
}
}
- /* If we've changed b, restore it */
- if (complement_b) {
- array_b[0] = 1;
- }
-
/* We may be removing a reference to one of the inputs */
if (a == *i || b == *i) {
assert(! invlist_is_iterating(*i));
* have a zero; removes it otherwise. As described above, the data
* structure is set up so that this is very efficient */
- UV* len_pos = _get_invlist_len_addr(invlist);
-
PERL_ARGS_ASSERT__INVLIST_INVERT;
assert(! invlist_is_iterating(invlist));
/* The inverse of matching nothing is matching everything */
- if (*len_pos == 0) {
+ if (_invlist_len(invlist) == 0) {
_append_range_to_invlist(invlist, 0, UV_MAX);
return;
}
- /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
- * zero element was a 0, so it is being removed, so the length decrements
- * by 1; and vice-versa. SvCUR is unaffected */
- if (*get_invlist_zero_addr(invlist) ^= 1) {
- (*len_pos)--;
- }
- else {
- (*len_pos)++;
- }
+ *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
}
void
invlist_extend(invlist, len);
array = invlist_array(invlist);
}
- invlist_set_len(invlist, len);
+ invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
array[len - 1] = PERL_UNICODE_MAX + 1;
}
else { /* Remove the 0x110000 */
- invlist_set_len(invlist, len - 1);
+ invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
}
}
/* Need to allocate extra space to accommodate Perl's addition of a
* trailing NUL to SvPV's, since it thinks they are always strings */
SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
- STRLEN length = SvCUR(invlist);
+ STRLEN physical_length = SvCUR(invlist);
+ bool offset = *(get_invlist_offset_addr(invlist));
PERL_ARGS_ASSERT_INVLIST_CLONE;
- SvCUR_set(new_invlist, length); /* This isn't done automatically */
- Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
+ *(get_invlist_offset_addr(new_invlist)) = offset;
+ invlist_set_len(new_invlist, _invlist_len(invlist), offset);
+ Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
return new_invlist;
}
-PERL_STATIC_INLINE UV*
+PERL_STATIC_INLINE STRLEN*
S_get_invlist_iter_addr(pTHX_ SV* invlist)
{
/* Return the address of the UV that contains the current iteration
PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
- return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
-}
+ assert(SvTYPE(invlist) == SVt_INVLIST);
-PERL_STATIC_INLINE UV*
-S_get_invlist_version_id_addr(pTHX_ SV* invlist)
-{
- /* Return the address of the UV that contains the version id. */
-
- PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
-
- return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
+ return &(((XINVLIST*) SvANY(invlist))->iterator);
}
PERL_STATIC_INLINE void
PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
- *get_invlist_iter_addr(invlist) = UV_MAX;
+ *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
}
STATIC bool
* <*start> and <*end> are unchanged, and the next call to this function
* will start over at the beginning of the list */
- UV* pos = get_invlist_iter_addr(invlist);
+ STRLEN* pos = get_invlist_iter_addr(invlist);
UV len = _invlist_len(invlist);
UV *array;
PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
if (*pos >= len) {
- *pos = UV_MAX; /* Force iterinit() to be required next time */
+ *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
return FALSE;
}
{
PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
- return *(get_invlist_iter_addr(invlist)) < UV_MAX;
+ return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
}
PERL_STATIC_INLINE UV
}
#endif
-#if 0
+#ifdef PERL_ARGS_ASSERT__INVLISTEQ
bool
-S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
+S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
{
/* Return a boolean as to if the two passed in inversion lists are
* identical. The final argument, if TRUE, says to take the complement of
* the second inversion list before doing the comparison */
- UV* array_a = invlist_array(a);
- UV* array_b = invlist_array(b);
+ const UV* array_a = invlist_array(a);
+ const UV* array_b = invlist_array(b);
UV len_a = _invlist_len(a);
UV len_b = _invlist_len(b);
/* Otherwise, to complement, we invert. Here, the first element is
* 0, just remove it. To do this, we just pretend the array starts
- * one later, and clear the flag as we don't have to do anything
- * else later */
+ * one later */
array_b++;
len_b--;
- complement_b = FALSE;
}
else {
- /* But if the first element is not zero, we unshift a 0 before the
- * array. The data structure reserves a space for that 0 (which
- * should be a '1' right now), so physical shifting is unneeded,
- * but temporarily change that element to 0. Before exiting the
- * routine, we must restore the element to '1' */
+ /* But if the first element is not zero, we pretend the list starts
+ * at the 0 that is always stored immediately before the array. */
array_b--;
len_b++;
- array_b[0] = 0;
}
}
}
}
- if (complement_b) {
- array_b[0] = 1;
- }
return retval;
}
#endif
#undef HEADER_LENGTH
-#undef INVLIST_INITIAL_LENGTH
#undef TO_INTERNAL_SIZE
#undef FROM_INTERNAL_SIZE
-#undef INVLIST_LEN_OFFSET
-#undef INVLIST_ZERO_OFFSET
-#undef INVLIST_ITER_OFFSET
#undef INVLIST_VERSION_ID
-#undef INVLIST_PREVIOUS_INDEX_OFFSET
/* End of inversion list object */
#define WASTED_O 0x01
#define WASTED_G 0x02
#define WASTED_C 0x04
-#define WASTED_GC (0x02|0x04)
+#define WASTED_GC (WASTED_G|WASTED_C)
I32 wastedflags = 0x00;
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
if (! (wastedflags & wflagbit) ) {
wastedflags |= wflagbit;
+ /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
vWARN5(
RExC_parse + 1,
"Useless (%s%c) - %suse /%c modifier",
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
if (! (wastedflags & WASTED_C) ) {
wastedflags |= WASTED_GC;
+ /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
vWARN3(
RExC_parse + 1,
"Useless (%sc) - %suse /gc modifier",
cannot happen. */
STATIC regnode *
S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
- /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
+ /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
+ * 2 is like 1, but indicates that nextchar() has been called to advance
+ * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
+ * this flag alerts us to the need to check for that */
{
dVAR;
regnode *ret; /* Will be the head of the group. */
/* Make an OPEN node, if parenthesized. */
if (paren) {
+
+ /* Under /x, space and comments can be gobbled up between the '(' and
+ * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
+ * intervening space, as the sequence is a token, and a token should be
+ * indivisible */
+ bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
+
if ( *RExC_parse == '*') { /* (*VERB:ARG) */
char *start_verb = RExC_parse;
STRLEN verb_len = 0;
unsigned char op = 0;
int argok = 1;
int internal_argval = 0; /* internal_argval is only useful if !argok */
+
+ if (has_intervening_patws && SIZE_ONLY) {
+ ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
+ }
while ( *RExC_parse && *RExC_parse != ')' ) {
if ( *RExC_parse == ':' ) {
start_arg = RExC_parse + 1;
}
nextchar(pRExC_state);
return ret;
- } else
- if (*RExC_parse == '?') { /* (?...) */
+ }
+ else if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
+ if (has_intervening_patws && SIZE_ONLY) {
+ ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
+ }
RExC_parse++;
paren = *RExC_parse++;
*flagp |= HASWIDTH;
Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
return ret;
case '@': /* (?@...) */
vFAIL2("Sequence (?%c...) not implemented", (int)paren);
break;
+ case '#': /* (?#...) */
+ /* XXX As soon as we disallow separating the '?' and '*' (by
+ * spaces or (?#...) comment), it is believed that this case
+ * will be unreachable and can be removed. See
+ * [perl #117327] */
+ while (*RExC_parse && *RExC_parse != ')')
+ RExC_parse++;
+ if (*RExC_parse != ')')
+ FAIL("Sequence (?#... not terminated");
+ nextchar(pRExC_state);
+ *flagp = TRYAGAIN;
+ return NULL;
case '0' : /* (?0) */
case 'R' : /* (?R) */
if (*RExC_parse != ')')
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X",
- flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
+ (UV) flags);
} else
REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
c = *nextchar(pRExC_state);
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X",
- flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
+ (UV) flags);
}
REGTAIL(pRExC_state, ret, lastbr);
if (flags&HASWIDTH)
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
}
if (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
}
REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
lastbr = br;
case ':':
ender = reg_node(pRExC_state, TAIL);
break;
- case 1:
+ case 1: case 2:
ender = reganode(pRExC_state, CLOSE, parno);
if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
if (paren == '>')
node = SUSPEND, flag = 0;
reginsert(pRExC_state, node,ret, depth+1);
- Set_Node_Cur_Length(ret);
+ Set_Node_Cur_Length(ret, parse_start);
Set_Node_Offset(ret, parse_start + 1);
ret->flags = flag;
REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
/* Check for proper termination. */
if (paren) {
- RExC_flags = oregflags;
+ /* restore original flags, but keep (?p) */
+ RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched (");
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
+ FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
}
else if (ret == NULL)
ret = latest;
if (flags & (TRYAGAIN|RESTART_UTF8))
*flagp |= flags & (TRYAGAIN|RESTART_UTF8);
else
- FAIL2("panic: regatom returned NULL, flags=%#X", flags);
+ FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
return(NULL);
}
ret = reg_node(pRExC_state, OPFAIL);
return ret;
}
- else if (max == 0) { /* replace {0} with a nothing node */
- if (SIZE_ONLY) {
- RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
- }
- else {
- RExC_emit = orig_emit;
- }
- ret = reg_node(pRExC_state, NOTHING);
- return ret;
- }
do_curly:
if ((flags&SIMPLE)) {
RExC_naughty += 2 + RExC_naughty / 2;
reginsert(pRExC_state, CURLY, ret, depth+1);
Set_Node_Offset(ret, parse_start+1); /* MJD */
- Set_Node_Cur_Length(ret);
+ Set_Node_Cur_Length(ret, parse_start);
}
else {
regnode * const w = reg_node(pRExC_state, WHILEM);
reginsert(pRExC_state, MINMOD, ret, depth+1);
REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
}
-#ifndef REG_ALLOW_MINMOD_SUSPEND
else
-#endif
if (RExC_parse < RExC_end && *RExC_parse == '+') {
regnode *ender;
nextchar(pRExC_state);
ret->flags = 0;
ender = reg_node(pRExC_state, TAIL);
REGTAIL(pRExC_state, ret, ender);
- /*ret= ender;*/
}
if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
*flagp = RESTART_UTF8;
return FALSE;
}
- FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
- flags);
+ FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
+ (UV) flags);
}
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
* additionally will populate the node's STRING with <code_point>, if <len>
* is 0. In both cases <*flagp> is appropriately set
*
- * It knows that under FOLD, UTF characters and the Latin Sharp S must be
- * folded (the latter only when the rules indicate it can match 'ss') */
+ * It knows that under FOLD, the Latin Sharp S and UTF characters above
+ * 255, must be folded (the former only when the rules indicate it can
+ * match 'ss') */
bool len_passed_in = cBOOL(len != 0);
U8 character[UTF8_MAXBYTES_CASE+1];
if (! len_passed_in) {
if (UTF) {
- if (FOLD) {
- to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
+ if (FOLD && (! LOC || code_point > 255)) {
+ _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+ character,
+ &len,
+ FOLD_FLAGS_FULL | ((LOC)
+ ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
}
else {
uvchr_to_utf8( character, code_point);
if (ret == NULL) {
if (*flagp & RESTART_UTF8)
return NULL;
- FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+ (UV) *flagp);
}
nextchar(pRExC_state);
Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
}
case '(':
nextchar(pRExC_state);
- ret = reg(pRExC_state, 1, &flags,depth+1);
+ ret = reg(pRExC_state, 2, &flags,depth+1);
if (ret == NULL) {
if (flags & TRYAGAIN) {
if (RExC_parse == RExC_end) {
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
+ FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
}
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
break;
/* regclass() can only return RESTART_UTF8 if multi-char folds
are allowed. */
if (!ret)
- FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+ (UV) *flagp);
RExC_parse--;
Set_Node_Offset(ret, parse_start + 2);
- Set_Node_Cur_Length(ret);
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
}
break;
/* override incorrect value set in reganode MJD */
Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
}
goto parse_named_seq;
} }
num = atoi(RExC_parse);
- if (isg && num == 0)
- vFAIL("Reference to invalid group 0");
+ if (isg && num == 0) {
+ if (*RExC_parse == '0') {
+ vFAIL("Reference to invalid group 0");
+ }
+ else {
+ vFAIL("Unterminated \\g... pattern");
+ }
+ }
if (isrel) {
num = RExC_npar - num;
if (num < 1)
vFAIL("Reference to nonexistent or unclosed group");
}
- if (!isg && num > 9 && num >= RExC_npar)
+ if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
/* Probably a character specified in octal, e.g. \35 */
goto defchar;
else {
+#ifdef RE_TRACK_PATTERN_OFFSETS
char * const parse_start = RExC_parse - 1; /* MJD */
+#endif
while (isDIGIT(*RExC_parse))
RExC_parse++;
- if (parse_start == RExC_parse - 1)
- vFAIL("Unterminated \\g... pattern");
if (hasbrace) {
if (*RExC_parse != '}')
vFAIL("Unterminated \\g{...} pattern");
/* override incorrect value set in reganode MJD */
Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
RExC_parse--;
nextchar(pRExC_state);
}
p++;
ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
break;
- case '0': case '1': case '2': case '3':case '4':
+ case '8': case '9': /* must be a backreference */
+ --p;
+ goto loopdone;
+ case '1': case '2': case '3':case '4':
case '5': case '6': case '7':
- if (*p == '0' ||
- (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
+ /* When we parse backslash escapes there is ambiguity between
+ * backreferences and octal escapes. Any escape from \1 - \9 is
+ * a backreference, any multi-digit escape which does not start with
+ * 0 and which when evaluated as decimal could refer to an already
+ * parsed capture buffer is a backslash. Anything else is octal.
+ *
+ * Note this implies that \118 could be interpreted as 118 OR as
+ * "\11" . "8" depending on whether there were 118 capture buffers
+ * defined already in the pattern.
+ */
+ if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
+ { /* Not to be treated as an octal constant, go
+ find backref */
+ --p;
+ goto loopdone;
+ }
+ case '0':
{
I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
STRLEN numlen = 3;
form_short_octal_warning(p, numlen));
}
}
- else { /* Not to be treated as an octal constant, go
- find backref */
- --p;
- goto loopdone;
- }
if (PL_encoding && ender < 0x100)
goto recode_encoding;
break;
if (! SIZE_ONLY
&& RExC_flags & RXf_PMf_EXTENDED
- && ckWARN(WARN_DEPRECATED)
+ && ckWARN_d(WARN_DEPRECATED)
&& is_PATWS_non_low(p, UTF))
{
vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
goto loopdone;
}
- if (FOLD) {
- if (UTF
- /* See comments for join_exact() as to why we fold
- * this non-UTF at compile time */
- || (node_type == EXACTFU
- && ender == LATIN_SMALL_LETTER_SHARP_S))
- {
-
-
- /* Prime the casefolded buffer. Locale rules, which
- * apply only to code points < 256, aren't known until
- * execution, so for them, just output the original
- * character using utf8. If we start to fold non-UTF
- * patterns, be sure to update join_exact() */
- if (LOC && ender < 256) {
- if (UNI_IS_INVARIANT(ender)) {
- *s = (U8) ender;
- foldlen = 1;
- } else {
- *s = UTF8_TWO_BYTE_HI(ender);
- *(s + 1) = UTF8_TWO_BYTE_LO(ender);
- foldlen = 2;
- }
+ if (! FOLD) {
+ if (UTF) {
+ const STRLEN unilen = reguni(pRExC_state, ender, s);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
}
- else {
- UV folded = _to_uni_fold_flags(
- ender,
- (U8 *) s,
- &foldlen,
- FOLD_FLAGS_FULL
- | ((LOC) ? FOLD_FLAGS_LOCALE
- : (ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0)
- );
- /* If this node only contains non-folding code
- * points so far, see if this new one is also
- * non-folding */
- if (maybe_exact) {
- if (folded != ender) {
- maybe_exact = FALSE;
+ /* The loop increments <len> each time, as all but this
+ * path (and one other) through it add a single byte to
+ * the EXACTish node. But this one has changed len to
+ * be the correct final value, so subtract one to
+ * cancel out the increment that follows */
+ len--;
+ }
+ else {
+ REGC((char)ender, s++);
+ }
+ }
+ else /* FOLD */
+ if (! ( UTF
+ /* See comments for join_exact() as to why we fold this
+ * non-UTF at compile time */
+ || (node_type == EXACTFU
+ && ender == LATIN_SMALL_LETTER_SHARP_S)))
+ {
+ *(s++) = (char) ender;
+ maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
+ }
+ else { /* UTF */
+
+ /* Prime the casefolded buffer. Locale rules, which apply
+ * only to code points < 256, aren't known until execution,
+ * so for them, just output the original character using
+ * utf8. If we start to fold non-UTF patterns, be sure to
+ * update join_exact() */
+ if (LOC && ender < 256) {
+ if (UNI_IS_INVARIANT(ender)) {
+ *s = (U8) ender;
+ foldlen = 1;
+ } else {
+ *s = UTF8_TWO_BYTE_HI(ender);
+ *(s + 1) = UTF8_TWO_BYTE_LO(ender);
+ foldlen = 2;
+ }
+ }
+ else {
+ UV folded = _to_uni_fold_flags(
+ ender,
+ (U8 *) s,
+ &foldlen,
+ FOLD_FLAGS_FULL
+ | ((LOC) ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0)
+ );
+
+ /* If this node only contains non-folding code points
+ * so far, see if this new one is also non-folding */
+ if (maybe_exact) {
+ if (folded != ender) {
+ maybe_exact = FALSE;
+ }
+ else {
+ /* Here the fold is the original; we have
+ * to check further to see if anything
+ * folds to it */
+ if (! PL_utf8_foldable) {
+ SV* swash = swash_init("utf8",
+ "_Perl_Any_Folds",
+ &PL_sv_undef, 1, 0);
+ PL_utf8_foldable =
+ _get_swash_invlist(swash);
+ SvREFCNT_dec_NN(swash);
}
- else {
- /* Here the fold is the original; we have
- * to check further to see if anything
- * folds to it */
- if (! PL_utf8_foldable) {
- SV* swash = swash_init("utf8",
- "_Perl_Any_Folds",
- &PL_sv_undef, 1, 0);
- PL_utf8_foldable =
- _get_swash_invlist(swash);
- SvREFCNT_dec_NN(swash);
- }
- if (_invlist_contains_cp(PL_utf8_foldable,
- ender))
- {
- maybe_exact = FALSE;
- }
+ if (_invlist_contains_cp(PL_utf8_foldable,
+ ender))
+ {
+ maybe_exact = FALSE;
}
}
- ender = folded;
}
- s += foldlen;
-
- /* The loop increments <len> each time, as all but this
- * path (and the one just below for UTF) through it add
- * a single byte to the EXACTish node. But this one
- * has changed len to be the correct final value, so
- * subtract one to cancel out the increment that
- * follows */
- len += foldlen - 1;
- }
- else {
- *(s++) = (char) ender;
- maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
+ ender = folded;
}
+ s += foldlen;
+
+ /* The loop increments <len> each time, as all but this
+ * path (and one other) through it add a single byte to the
+ * EXACTish node. But this one has changed len to be the
+ * correct final value, so subtract one to cancel out the
+ * increment that follows */
+ len += foldlen - 1;
}
- else if (UTF) {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
-
- /* See comment just above for - 1 */
- len--;
- }
- else {
- REGC((char)ender, s++);
- }
if (next_is_quantifier) {
loopdone: /* Jumped to when encounters something that shouldn't be in
the node */
- /* If 'maybe_exact' is still set here, means there are no
- * code points in the node that participate in folds */
- if (FOLD && maybe_exact) {
- OP(ret) = EXACT;
- }
-
/* I (khw) don't know if you can get here with zero length, but the
* old code handled this situation by creating a zero-length EXACT
* node. Might as well be NOTHING instead */
OP(ret) = NOTHING;
}
else{
+
+ /* If 'maybe_exact' is still set here, means there are no
+ * code points in the node that participate in folds */
+ if (FOLD && maybe_exact) {
+ OP(ret) = EXACT;
+ }
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
}
RExC_parse = p - 1;
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
{
/* len is STRLEN which is unsigned, need to copy to signed */
* these things, we need to realize that something preceded by a backslash
* is escaped, so we have to keep track of backslashes */
if (SIZE_ONLY) {
+ UV depth = 0; /* how many nested (?[...]) constructs */
Perl_ck_warner_d(aTHX_
packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
RExC_parse = regpatws(pRExC_state, RExC_parse,
TRUE); /* means recognize comments */
switch (*RExC_parse) {
+ case '?':
+ if (RExC_parse[1] == '[') depth++, RExC_parse++;
+ /* FALL THROUGH */
default:
break;
case '\\':
FALSE, /* don't allow multi-char folds */
TRUE, /* silence non-portable warnings. */
¤t))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
+ (UV) *flagp);
/* function call leaves parse pointing to the ']', except
* if we faked it */
}
case ']':
+ if (depth--) break;
RExC_parse++;
if (RExC_parse < RExC_end
&& *RExC_parse == ')')
* been parsed and evaluated to a single operand (or else is a syntax
* error), and is handled as a regular operand */
- stack = newAV();
+ sv_2mortal((SV *)(stack = newAV()));
while (RExC_parse < RExC_end) {
I32 top_index = av_tindex(stack);
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
¤t))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
+ (UV) *flagp);
/* regclass() will return with parsing just the \ sequence,
* leaving the parse pointer at the next thing to parse */
RExC_parse--;
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
¤t))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
+ (UV) *flagp);
/* function call leaves parse pointing to the ']', except if we
* faked it */
if (is_posix_class) {
|| IS_OPERAND(lparen)
|| SvUV(lparen) != '(')
{
+ SvREFCNT_dec(current);
RExC_parse++;
vFAIL("Unexpected ')'");
}
}
else {
SV* top = av_pop(stack);
+ SV *prev = NULL;
char current_operator;
if (IS_OPERAND(top)) {
+ SvREFCNT_dec_NN(top);
+ SvREFCNT_dec_NN(current);
vFAIL("Operand with no preceding operator");
}
current_operator = (char) SvUV(top);
goto handle_operand;
case '&':
- _invlist_intersection(av_pop(stack),
+ prev = av_pop(stack);
+ _invlist_intersection(prev,
current,
¤t);
av_push(stack, current);
case '|':
case '+':
- _invlist_union(av_pop(stack), current, ¤t);
+ prev = av_pop(stack);
+ _invlist_union(prev, current, ¤t);
av_push(stack, current);
break;
case '-':
- _invlist_subtract(av_pop(stack), current, ¤t);
+ prev = av_pop(stack);;
+ _invlist_subtract(prev, current, ¤t);
av_push(stack, current);
break;
SV* u = NULL;
SV* element;
- element = av_pop(stack);
- _invlist_union(element, current, &u);
- _invlist_intersection(element, current, &i);
+ prev = av_pop(stack);
+ _invlist_union(prev, current, &u);
+ _invlist_intersection(prev, current, &i);
+ /* _invlist_subtract will overwrite current
+ without freeing what it already contains */
+ element = current;
_invlist_subtract(u, i, ¤t);
av_push(stack, current);
SvREFCNT_dec_NN(i);
Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
}
SvREFCNT_dec_NN(top);
+ SvREFCNT_dec(prev);
}
}
RExC_end = save_end;
SvREFCNT_dec_NN(final);
SvREFCNT_dec_NN(result_string);
- SvREFCNT_dec_NN(stack);
nextchar(pRExC_state);
Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
/* <multi_char_matches> is actually an array of arrays.
* There will be one or two top-level elements: [2],
* and/or [3]. The [2] element is an array, each
- * element thereof is a character which folds to two
- * characters; likewise for [3]. (Unicode guarantees a
- * maximum of 3 characters in any fold.) When we
- * rewrite the character class below, we will do so
- * such that the longest folds are written first, so
- * that it prefers the longest matching strings first.
- * This is done even if it turns out that any
- * quantifier is non-greedy, out of programmer
- * laziness. Tom Christiansen has agreed that this is
- * ok. This makes the test for the ligature 'ffi' come
- * before the test for 'ff' */
+ * element thereof is a character which folds to TWO
+ * characters; [3] is for folds to THREE characters.
+ * (Unicode guarantees a maximum of 3 characters in any
+ * fold.) When we rewrite the character class below,
+ * we will do so such that the longest folds are
+ * written first, so that it prefers the longest
+ * matching strings first. This is done even if it
+ * turns out that any quantifier is non-greedy, out of
+ * programmer laziness. Tom Christiansen has agreed
+ * that this is ok. This makes the test for the
+ * ligature 'ffi' come before the test for 'ff' */
if (av_exists(multi_char_matches, cp_count)) {
this_array_ptr = (AV**) av_fetch(multi_char_matches,
cp_count, FALSE);
default:
/* Use deprecated warning to increase the
* chances of this being output */
- ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
+ ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
break;
}
}
* doesn't allow them between above and below 256 */
if ((ASCII_FOLD_RESTRICTED
&& (isASCII(c) != isASCII(j)))
- || (LOC && ((c < 256) != (j < 256))))
- {
+ || (LOC && c < 256)) {
continue;
}
if (ret_invlist) {
*ret_invlist = cp_list;
+ SvREFCNT_dec(swash);
/* Discard the generated node */
if (SIZE_ONLY) {
for (i = start; i <= (int) high; i++) {
if (! ANYOF_BITMAP_TEST(ret, i)) {
ANYOF_BITMAP_SET(ret, i);
- prevvalue = value;
- value = i;
}
}
}
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
#ifdef DEBUGGING
+
+static void
+S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
+{
+ int bit;
+ int set=0;
+
+ for (bit=0; bit<32; bit++) {
+ if (flags & (1<<bit)) {
+ if (!set++ && lead)
+ PerlIO_printf(Perl_debug_log, "%s",lead);
+ PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
+ }
+ }
+ if (lead) {
+ if (set)
+ PerlIO_printf(Perl_debug_log, "\n");
+ else
+ PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
+ }
+}
+
static void
S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
{
if (r->extflags & RXf_EVAL_SEEN)
PerlIO_printf(Perl_debug_log, "with eval ");
PerlIO_printf(Perl_debug_log, "\n");
- DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
+ DEBUG_FLAGS_r({
+ regdump_extflags("r->extflags: ",r->extflags);
+ regdump_intflags("r->intflags: ",r->intflags);
+ });
#else
PERL_ARGS_ASSERT_REGDUMP;
PERL_UNUSED_CONTEXT;
)
);
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
- int i;
- int rangestart = -1;
- U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
sv_catpvs(sv, "[");
- for (i = 0; i <= 256; i++) {
- if (i < 256 && BITMAP_TEST(bitmap,i)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++)
- put_byte(sv, rangestart);
- else {
- put_byte(sv, rangestart);
- sv_catpvs(sv, "-");
- put_byte(sv, i - 1);
- }
- rangestart = -1;
- }
- }
+ (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
+ ? ANYOF_BITMAP(o)
+ : TRIE_BITMAP(trie));
sv_catpvs(sv, "]");
}
} else if (k == LOGICAL)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == ANYOF) {
- int i, rangestart = -1;
const U8 flags = ANYOF_FLAGS(o);
int do_sep = 0;
sv_catpvs(sv, "^");
/* output what the standard cp 0-255 bitmap matches */
- for (i = 0; i <= 256; i++) {
- if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++)
- put_byte(sv, rangestart);
- else {
- put_byte(sv, rangestart);
- sv_catpvs(sv, "-");
- put_byte(sv, i - 1);
- }
- do_sep = 1;
- rangestart = -1;
- }
- }
+ do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
/* output any special charclass tests (used entirely under use locale) */
- if (ANYOF_CLASS_TEST_ANY_SET(o))
- for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
+ if (ANYOF_CLASS_TEST_ANY_SET(o)) {
+ int i;
+ for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
if (ANYOF_CLASS_TEST(o,i)) {
sv_catpv(sv, anyofs[i]);
do_sep = 1;
}
+ }
+ }
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
/* output information about the unicode matching */
if (flags & ANYOF_UNICODE_ALL)
sv_catpvs(sv, "{unicode_all}");
- else if (ANYOF_NONBITMAP(o))
- sv_catpvs(sv, "{unicode}");
- if (flags & ANYOF_NONBITMAP_NON_UTF8)
- sv_catpvs(sv, "{outside bitmap}");
-
- if (ANYOF_NONBITMAP(o)) {
- SV *lv; /* Set if there is something outside the bit map */
- SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
+ else if (ANYOF_NONBITMAP(o)) {
+ SV *lv; /* Set if there is something outside the bit map. */
+ SV * sw;
bool byte_output = FALSE; /* If something in the bitmap has been
output */
- if (lv && lv != &PL_sv_undef) {
- if (sw) {
- U8 s[UTF8_MAXBYTES_CASE+1];
-
- for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
- uvchr_to_utf8(s, i);
-
- if (i < 256
- && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
- things already
- output as part
- of the bitmap */
- && swash_fetch(sw, s, TRUE))
- {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- byte_output = TRUE;
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++) {
- put_byte(sv, rangestart);
- }
- else {
- put_byte(sv, rangestart);
- sv_catpvs(sv, "-");
- put_byte(sv, i-1);
- }
- rangestart = -1;
- }
- }
- }
+ if (flags & ANYOF_NONBITMAP_NON_UTF8) {
+ sv_catpvs(sv, "{outside bitmap}");
+ }
+ else {
+ sv_catpvs(sv, "{utf8}");
+ }
- {
- char *s = savesvpv(lv);
- char * const origs = s;
+ /* Get the stuff that wasn't in the bitmap */
+ sw = regclass_swash(prog, o, FALSE, &lv, NULL);
+ if (lv && lv != &PL_sv_undef) {
+ char *s = savesvpv(lv);
+ char * const origs = s;
- while (*s && *s != '\n')
- s++;
+ while (*s && *s != '\n')
+ s++;
- if (*s == '\n') {
- const char * const t = ++s;
+ if (*s == '\n') {
+ const char * const t = ++s;
- if (byte_output) {
- sv_catpvs(sv, " ");
- }
+ if (byte_output) {
+ sv_catpvs(sv, " ");
+ }
- while (*s) {
- if (*s == '\n') {
+ while (*s) {
+ if (*s == '\n') {
- /* Truncate very long output */
- if (s - origs > 256) {
- Perl_sv_catpvf(aTHX_ sv,
- "%.*s...",
- (int) (s - origs - 1),
- t);
- goto out_dump;
- }
- *s = ' ';
- }
- else if (*s == '\t') {
- *s = '-';
- }
- s++;
- }
- if (s[-1] == ' ')
- s[-1] = 0;
+ /* Truncate very long output */
+ if (s - origs > 256) {
+ Perl_sv_catpvf(aTHX_ sv,
+ "%.*s...",
+ (int) (s - origs - 1),
+ t);
+ goto out_dump;
+ }
+ *s = ' ';
+ }
+ else if (*s == '\t') {
+ *s = '-';
+ }
+ s++;
+ }
+ if (s[-1] == ' ')
+ s[-1] = 0;
- sv_catpv(sv, t);
- }
+ sv_catpv(sv, t);
+ }
- out_dump:
+ out_dump:
- Safefree(origs);
- }
+ Safefree(origs);
SvREFCNT_dec_NN(lv);
}
}
so we need to copy it locally. */
RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
ret->mother_re = NULL;
- ret->gofs = 0;
}
#endif /* PERL_IN_XSUB_RE */
{
dVAR;
- struct re_save_state *state;
-
- SAVEVPTR(PL_curcop);
- SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
-
- state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
- PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
- SSPUSHUV(SAVEt_RE_STATE);
-
- Copy(&PL_reg_state, state, 1, struct re_save_state);
-
- PL_reg_oldsaved = NULL;
- PL_reg_oldsavedlen = 0;
- PL_reg_oldsavedoffset = 0;
- PL_reg_oldsavedcoffset = 0;
- PL_reg_maxiter = 0;
- PL_reg_leftiter = 0;
- PL_reg_poscache = NULL;
- PL_reg_poscache_size = 0;
-#ifdef PERL_ANY_COW
- PL_nrs = NULL;
-#endif
-
/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
if (PL_curpm) {
const REGEXP * const rx = PM_GETRE(PL_curpm);
So the old condition can be simplified to !isPRINT(c) */
if (!isPRINT(c)) {
- if (c < 256) {
- Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
- }
- else {
- Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
- }
+ switch (c) {
+ case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
+ case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
+ case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
+ case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
+ case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
+
+ default:
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
+ break;
+ }
}
else {
const char string = c;
}
}
+STATIC bool
+S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
+{
+ /* Appends to 'sv' a displayable version of the innards of the bracketed
+ * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
+ * output anything */
+
+ int i;
+ int rangestart = -1;
+ bool has_output_anything = FALSE;
+
+ PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
+
+ for (i = 0; i <= 256; i++) {
+ if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ int j = i - 1;
+ if (i <= rangestart + 3) { /* Individual chars in short ranges */
+ for (; rangestart < i; rangestart++)
+ put_byte(sv, rangestart);
+ }
+ else if ( j > 255
+ || ! isALPHANUMERIC(rangestart)
+ || ! isALPHANUMERIC(j)
+ || isDIGIT(rangestart) != isDIGIT(j)
+ || isUPPER(rangestart) != isUPPER(j)
+ || isLOWER(rangestart) != isLOWER(j)
+
+ /* This final test should get optimized out except
+ * on EBCDIC platforms, where it causes ranges that
+ * cross discontinuities like i/j to be shown as hex
+ * instead of the misleading, e.g. H-K (since that
+ * range includes more than H, I, J, K). */
+ || (j - rangestart)
+ != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
+ {
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
+ rangestart,
+ (j < 256) ? j : 255);
+ }
+ else { /* Here, the ends of the range are both digits, or both
+ uppercase, or both lowercase; and there's no
+ discontinuity in the range (which could happen on EBCDIC
+ platforms) */
+ put_byte(sv, rangestart);
+ sv_catpvs(sv, "-");
+ put_byte(sv, j);
+ }
+ rangestart = -1;
+ has_output_anything = TRUE;
+ }
+ }
+
+ return has_output_anything;
+}
#define CLEAR_OPTSTART \
if (optstart) STMT_START { \